From 009007120becd6644ad302fc096624681958b9c2 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 28 Nov 2019 20:08:51 +0200 Subject: [PATCH] Shugeo_release_fixes3 (#81) * Implementation for non_max_suppression_v3 was added. Initial version * Added check for overcome threshold. * Added definition for V3 method. * java remapping for NonMaxSuppressionV3 Signed-off-by: raver119 * Fixed proporly processing of an empty output and test. * Refactored op to less threshold data to float. * Implemented cuda-based helper for non_max_suppression_v3 op. * Fixed fake_quant_with_min_max_vars op. * Fixed tests with float numbers. * - assert now stops execution - sortByKey/sortByValue now have input validation Signed-off-by: raver119 * missing var Signed-off-by: raver119 * Fixed proper processing for zero max_size inputs. * Refactored kernel callers. * Fixed return statement for logdet op helper. * Refactored unsorted segment SqrtN op. * get back 8 tail bytes on CUDA Signed-off-by: raver119 * Refactored segment prod ops and helpers for cuda and tests. * Additional test. * CudaWorkspace tests updated for 8 tail bytes Signed-off-by: raver119 * special atomic test Signed-off-by: raver119 * atomicMul/atomicDiv fix for 16bit values Signed-off-by: raver119 * Eliminated waste prints. --- libnd4j/blas/cuda/NativeOps.cu | 14 + libnd4j/include/op_boilerplate.h | 6 +- .../declarable/generic/parity_ops/assert.cpp | 4 +- .../parity_ops/non_max_suppression.cpp | 149 ++++++++-- .../parity_ops/unsorted_segment_prod.cpp | 12 +- .../ops/declarable/headers/parity_ops.h | 5 +- .../helpers/cpu/fake_quantization.cpp | 11 +- .../helpers/cpu/image_suppression.cpp | 77 ++++- .../helpers/cuda/fake_quantization.cu | 2 +- .../helpers/cuda/image_suppression.cu | 92 ++++-- .../ops/declarable/helpers/cuda/lup.cu | 41 ++- .../declarable/helpers/cuda/segment_prod.cu | 146 ++++------ .../declarable/helpers/cuda/segment_sqrtn.cu | 106 +++---- .../declarable/helpers/image_suppression.h | 2 + libnd4j/include/templatemath.h | 160 ++++------- libnd4j/tests_cpu/layers_tests/AtomicTests.cu | 86 ++++++ .../layers_tests/DeclarableOpsTests10.cpp | 272 ++++++++++++------ .../layers_tests/DeclarableOpsTests6.cpp | 2 - .../layers_tests/DeclarableOpsTests7.cpp | 93 +++++- .../tests_cpu/layers_tests/WorkspaceTests.cu | 4 +- .../converters/ImportClassMapping.java | 1 + .../api/ops/impl/image/NonMaxSuppression.java | 2 +- .../ops/impl/image/NonMaxSuppressionV3.java | 79 +++++ .../java/org/nd4j/nativeblas/Nd4jCpu.java | 19 +- 24 files changed, 945 insertions(+), 440 deletions(-) create mode 100644 libnd4j/tests_cpu/layers_tests/AtomicTests.cu create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppressionV3.java diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 024012808..c8cb3a616 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -2202,10 +2202,17 @@ void sortByKey(Nd4jPointer *extraPointers, auto stream = reinterpret_cast(extraPointers[1]); auto xLength = shape::length(xShapeInfo); + auto yLength = shape::length(yShapeInfo); auto xEWS = shape::elementWiseStride(xShapeInfo); auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); auto yType = nd4j::ArrayOptions::dataType(yShapeInfo); + if (shape::isEmpty(xShapeInfo) || shape::isEmpty(yShapeInfo)) + return; + + if (xLength != yLength) + throw std::runtime_error("sortByKey: keys and values must have the same size"); + // check if xLength is a power of 2, and use bitonic sort, if 
that's the case if ((xLength != 0) && ((xLength & (xLength - 1)) == 0) && (xLength <= 1024 * 1024 * 10)) { @@ -2269,10 +2276,17 @@ void sortByValue(Nd4jPointer *extraPointers, auto stream = reinterpret_cast(extraPointers[1]); auto xLength = shape::length(xShapeInfo); + auto yLength = shape::length(yShapeInfo); auto xEWS = shape::elementWiseStride(xShapeInfo); auto xType = nd4j::ArrayOptions::dataType(yShapeInfo); auto yType = nd4j::ArrayOptions::dataType(xShapeInfo); + if (shape::isEmpty(xShapeInfo) || shape::isEmpty(yShapeInfo)) + return; + + if (xLength != yLength) + throw std::runtime_error("sortByValue: keys and values must have the same size"); + // check if xLength is a power of 2, and use bitonic sort, if that's the case if ((xLength != 0) && ((xLength & (xLength - 1)) == 0) && (xLength <= 1024 * 1024 * 10)) { diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h index 102a1776a..4a6561f3b 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/op_boilerplate.h @@ -1461,12 +1461,14 @@ #ifdef _RELEASE -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } +// we intentionally add 8 tail bytes here to avoid problems with atomic operations +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { nd4j::memory::MemoryTracker::getInstance()->countIn(nd4j::memory::MemoryType::DEVICE, VARIABLE, LENGTH * sizeof(TT)); }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } +// we intentionally add 8 tail bytes here to avoid problems with atomic operations +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { nd4j::memory::MemoryTracker::getInstance()->countIn(nd4j::memory::MemoryType::DEVICE, VARIABLE, LENGTH * sizeof(TT)); }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { nd4j::memory::MemoryTracker::getInstance()->countOut(VARIABLE); auto erc_##VARIABLE = 
cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #endif diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp index da61e5a65..cd8d50df2 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp @@ -29,8 +29,8 @@ namespace nd4j { OP_IMPL(Assert, 1, 1, false) { auto x = INPUT_VARIABLE(0); - if (x->e(0) == 0.0f) { - nd4j_printf("Assertion failed for node [%i]\n", block.getNodeId()); + if (!x->e(0)) { + REQUIRE_TRUE(false, 0, "Assertion failed for node [%i]\n", block.getNodeId()); } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp index e07a05de5..e0812c403 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp @@ -21,10 +21,10 @@ #include #include -#if NOT_EXCLUDED(OP_image_non_max_suppression) namespace nd4j { namespace ops { +#if NOT_EXCLUDED(OP_image_non_max_suppression) CUSTOM_OP_IMPL(non_max_suppression, 2, 1, false, 0, 0) { auto boxes = INPUT_VARIABLE(0); auto scales = INPUT_VARIABLE(1); @@ -56,11 +56,24 @@ namespace nd4j { if (boxes->isEmpty() || scales->isEmpty()) return Status::OK(); - REQUIRE_TRUE(boxes->rankOf() == 2, 0, "image.non_max_suppression: The rank of boxes array should be 2, but %i is given", boxes->rankOf()); - REQUIRE_TRUE(boxes->sizeAt(1) == 4, 0, "image.non_max_suppression: The last dimension of boxes array should be 4, but %i is given", boxes->sizeAt(1)); - REQUIRE_TRUE(scales->rankOf() == 1 && scales->lengthOf() == boxes->sizeAt(0), 0, "image.non_max_suppression: The rank of scales array should be 1, but %i is given", boxes->rankOf()); + if (output->isEmpty()) + return Status::OK(); - helpers::nonMaxSuppression(block.launchContext(), boxes, scales, maxOutputSize, overlayThreshold, scoreThreshold, output); + REQUIRE_TRUE(boxes->rankOf() == 2, 0, "image.non_max_suppression: The rank of boxes array should be 2, " + "but %i is given", boxes->rankOf()); + REQUIRE_TRUE(boxes->sizeAt(1) == 4, 0, "image.non_max_suppression: The last dimension of boxes array " + "should be 4, but %i is given", boxes->sizeAt(1)); + REQUIRE_TRUE(scales->rankOf() == 1 && scales->lengthOf() == boxes->sizeAt(0), 0, + "image.non_max_suppression: The rank of scales array should be 1, but %i is given", boxes->rankOf()); + REQUIRE_TRUE(overlayThreshold >= 0. 
&& overlayThreshold <= 1., 0, "image.non_max_suppressio: The overlay " + "threashold should be in [0, 1], but " + "%lf is given.", overlayThreshold); + REQUIRE_TRUE(boxes->dataType() == scales->dataType(), 0, + "image.non_max_suppression: Boxes and scores inputs should have the same data type, but %s and %s " + "were given.", DataTypeUtils::asString(boxes->dataType()).c_str(), + DataTypeUtils::asString(scales->dataType()).c_str()); + helpers::nonMaxSuppression(block.launchContext(), boxes, scales, maxOutputSize, overlayThreshold, + scoreThreshold, output); return Status::OK(); } @@ -77,20 +90,22 @@ namespace nd4j { else REQUIRE_TRUE(false, 0, "image.non_max_suppression: Max output size argument cannot be retrieved."); - auto actualIndicesCount = shape::sizeAt(in, 0); - if (block.getTArguments()->size() > 1 || block.width() > 4) { - auto scoreThreshold = block.getTArguments()->size() > 1?T_ARG(1):INPUT_VARIABLE(4)->e(0); - auto scales = INPUT_VARIABLE(1); - scales->syncToHost(); - for (auto e = 0; e < scales->lengthOf(); e++) { - if (scales->e(e) < (float)scoreThreshold) { - actualIndicesCount--; + if (maxOutputSize > 0) { + auto actualIndicesCount = shape::sizeAt(in, 0); + if (block.getTArguments()->size() > 1 || block.width() > 4) { + auto scoreThreshold = + block.getTArguments()->size() > 1 ? T_ARG(1) : INPUT_VARIABLE(4)->e(0); + auto scales = INPUT_VARIABLE(1); + scales->syncToHost(); + for (auto e = 0; e < scales->lengthOf(); e++) { + if (scales->e(e) < (float) scoreThreshold) { + actualIndicesCount--; + } } } + if (actualIndicesCount < maxOutputSize) + maxOutputSize = actualIndicesCount; } - if (actualIndicesCount < maxOutputSize) - maxOutputSize = actualIndicesCount; - outputShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(maxOutputSize, DataType::INT32); return SHAPELIST(outputShape); @@ -100,7 +115,107 @@ namespace nd4j { ->setAllowedInputTypes(nd4j::DataType::ANY) ->setAllowedOutputTypes({ALL_INDICES}); } +#endif +#if NOT_EXCLUDED(OP_image_non_max_suppression_v3) + DECLARE_TYPES(non_max_suppression_v3) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_INDICES}); + } + + CUSTOM_OP_IMPL(non_max_suppression_v3, 2, 1, false, 0, 0) { + auto boxes = INPUT_VARIABLE(0); + auto scales = INPUT_VARIABLE(1); + auto output = OUTPUT_VARIABLE(0); + int maxOutputSize; // = INT_ARG(0); + if (block.width() > 2) + maxOutputSize = INPUT_VARIABLE(2)->e(0); + else if (block.getIArguments()->size() == 1) + maxOutputSize = INT_ARG(0); + else + REQUIRE_TRUE(false, 0, "image.non_max_suppression: Max output size argument cannot be retrieved."); + + double overlayThreshold = 0.5; + double scoreThreshold = - DataTypeUtils::infOrMax(); + + if (block.width() > 3) { + overlayThreshold = INPUT_VARIABLE(3)->e(0); + } + else if (block.getTArguments()->size() > 0) { + overlayThreshold = T_ARG(0); + } + + if (block.width() > 4) { + scoreThreshold = INPUT_VARIABLE(4)->e(0); + } + else if (block.getTArguments()->size() > 1) { + scoreThreshold = T_ARG(1); + } + if (boxes->isEmpty() || scales->isEmpty()) + return Status::OK(); + if (output->isEmpty()) + return Status::OK(); + + REQUIRE_TRUE(boxes->rankOf() == 2, 0, "image.non_max_suppression: The rank of boxes array should be 2, but " + "%i is given", boxes->rankOf()); + REQUIRE_TRUE(boxes->sizeAt(1) == 4, 0, "image.non_max_suppression: The last dimension of boxes array should " + "be 4, but %i is given", boxes->sizeAt(1)); + REQUIRE_TRUE(scales->rankOf() == 1 && scales->lengthOf() == boxes->sizeAt(0), 0, + 
"image.non_max_suppression: The rank of scales array should be 1, but %i is given", boxes->rankOf()); + REQUIRE_TRUE(overlayThreshold >= 0. && overlayThreshold <= 1., 0, + "image.non_max_suppression_v3: The overlay threashold should be in [0, 1], but %lf given.", + overlayThreshold); + REQUIRE_TRUE(boxes->dataType() == scales->dataType(), 0, + "image.non_max_suppression_v3: Boxes and scores inputs should have the same data type, but %s and %s " + "were given.", DataTypeUtils::asString(boxes->dataType()).c_str(), + DataTypeUtils::asString(scales->dataType()).c_str()); + + helpers::nonMaxSuppressionV3(block.launchContext(), boxes, scales, maxOutputSize, overlayThreshold, + scoreThreshold, output); + return Status::OK(); + } + + DECLARE_SHAPE_FN(non_max_suppression_v3) { + auto in = inputShape->at(0); + int outRank = shape::rank(in); + Nd4jLong *outputShape = nullptr; + + int maxOutputSize; + if (block.width() > 2) + maxOutputSize = INPUT_VARIABLE(2)->e(0); + else if (block.getIArguments()->size() == 1) + maxOutputSize = INT_ARG(0); + else + REQUIRE_TRUE(false, 0, "image.non_max_suppression: Max output size argument cannot be retrieved."); + auto boxes = INPUT_VARIABLE(0); + auto scales = INPUT_VARIABLE(1); + + double overlayThreshold = 0.5; + double scoreThreshold = - DataTypeUtils::infOrMax(); + + if (block.width() > 3) { + overlayThreshold = INPUT_VARIABLE(3)->e(0); + } + else if (block.getTArguments()->size() > 0) { + overlayThreshold = T_ARG(0); + } + + if (block.width() > 4) { + scoreThreshold = INPUT_VARIABLE(4)->e(0); + } + else if (block.getTArguments()->size() > 1) { + scoreThreshold = T_ARG(1); + } + + auto len = maxOutputSize; + if (len > 0) + len = helpers::nonMaxSuppressionV3(block.launchContext(), boxes, scales, maxOutputSize, overlayThreshold, scoreThreshold, nullptr); + + outputShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(len, DataType::INT32); + + return SHAPELIST(outputShape); + } +#endif } } -#endif diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp index 56ffcbd69..695207c4b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp @@ -61,9 +61,9 @@ namespace nd4j { } DECLARE_TYPES(unsorted_segment_prod) { getOpDescriptor() - ->setAllowedOutputTypes({ALL_FLOATS}) - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_INTS}) + ->setAllowedOutputTypes({ALL_FLOATS, ALL_INTS}) + ->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS}) + ->setAllowedInputTypes(1, {ALL_INDICES}) ->setSameMode(false); } @@ -88,10 +88,10 @@ namespace nd4j { DECLARE_TYPES(unsorted_segment_prod_bp) { getOpDescriptor() ->setAllowedOutputTypes(0, {ALL_FLOATS}) - ->setAllowedOutputTypes(1, {ALL_INTS}) + ->setAllowedOutputTypes(1, {ALL_INDICES}) ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setAllowedInputTypes(2,{ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_INDICES}) + ->setAllowedInputTypes(2,{ALL_FLOATS, ALL_INTS}) ->setSameMode(false); } diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index f6ed43f47..0d354630c 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -1723,7 +1723,7 @@ namespace nd4j { #endif /** - * image.non_max_suppression op. 
+ * image.non_max_suppression ops. * input: * 0 - boxes - 2D-tensor with shape (num_boxes, 4) by float type * 1 - scales - 1D-tensor with shape (num_boxes) by float type @@ -1741,6 +1741,9 @@ namespace nd4j { #if NOT_EXCLUDED(OP_image_non_max_suppression) DECLARE_CUSTOM_OP(non_max_suppression, 2, 1, false, 0, 0); #endif + #if NOT_EXCLUDED(OP_image_non_max_suppression_v3) + DECLARE_CUSTOM_OP(non_max_suppression_v3, 2, 1, false, 0, 0); + #endif /* * image.non_max_suppression_overlaps op. diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index d55e4f6dd..81f00b066 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -77,7 +77,13 @@ namespace helpers { } } } - +// +//const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); +// const auto clamped_shifted = clamped - nudged_min; +// outputs.device(d) = (clamped_shifted / nudged_scale_repl + 0.5f).floor() * +// nudged_scale_repl + +// nudged_min; +// template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 1 : 0; @@ -95,7 +101,8 @@ namespace helpers { else if (val > nudgedMax) val = nudgedMax; // converse value with scale and shifted with nudged min - return (nd4j::math::nd4j_floor((val - nudgedMin)/scale + T(0.5)) * scale + nudgedMin); + val -= nudgedMin; + return (nd4j::math::nd4j_floor(val / scale + T(0.5f)) * scale + nudgedMin); }; input->applyLambda(fakeQuantizationWithMinMax, output); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index 9814821b2..bf5acd01e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -19,7 +19,7 @@ // #include -//#include +#include #include #include #include @@ -90,17 +90,61 @@ namespace helpers { } } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Return intersection-over-union overlap between boxes i and j + template + static inline T similirityV3_(NDArray const& boxes, Nd4jLong i, Nd4jLong j) { + const T zero = static_cast(0.f); + const T yminI = math::nd4j_min(boxes.t(i, 0), boxes.t(i, 2)); + const T xminI = math::nd4j_min(boxes.t(i, 1), boxes.t(i, 3)); + const T ymaxI = math::nd4j_max(boxes.t(i, 0), boxes.t(i, 2)); + const T xmaxI = math::nd4j_max(boxes.t(i, 1), boxes.t(i, 3)); + const T yminJ = math::nd4j_min(boxes.t(j, 0), boxes.t(j, 2)); + const T xminJ = math::nd4j_min(boxes.t(j, 1), boxes.t(j, 3)); + const T ymaxJ = math::nd4j_max(boxes.t(j, 0), boxes.t(j, 2)); + const T xmaxJ = math::nd4j_max(boxes.t(j, 1), boxes.t(j, 3)); + const T areaI = (ymaxI - yminI) * (xmaxI - xminI); + const T areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ); + if (areaI <= zero || areaJ <= zero) { + return zero; + } + const T intersectionYmin = math::nd4j_max(yminI, yminJ); + const T intersectionXmin = math::nd4j_max(xminI, xminJ); + const T intersectionYmax = math::nd4j_min(ymaxI, ymaxJ); + const T intersectionXmax = math::nd4j_min(xmaxI, xmaxJ); + const T intersectionY = intersectionYmax - intersectionYmin; + const T intersectionX = intersectionXmax - intersectionXmin; + const T intersectionArea = math::nd4j_max(intersectionY, zero) * 
math::nd4j_max(intersectionX, zero); + return intersectionArea / (areaI + areaJ - intersectionArea); + } + + template + static inline T similiratyOverlaps_(NDArray const& boxes, Nd4jLong i, Nd4jLong j) { + return boxes.t(i, j); + } + + typedef NDArray (*SimiliratyFunc)(NDArray const& boxes, Nd4jLong i, Nd4jLong j); + + static NDArray similiratyOverlaps(NDArray const& boxes, Nd4jLong i, Nd4jLong j) { + NDArray res(boxes.dataType(), boxes.getContext()); // = NDArrayFactory::create(0.); + BUILD_SINGLE_SELECTOR(boxes.dataType(), res = similiratyOverlaps_, (boxes, i, j) , FLOAT_TYPES); + return res; + } + + static NDArray similiratyV3(NDArray const& boxes, Nd4jLong i, Nd4jLong j) { + NDArray res(boxes.dataType(), boxes.getContext()); // = NDArrayFactory::create(0.); + BUILD_SINGLE_SELECTOR(boxes.dataType(), res = similirityV3_, (boxes, i, j) , FLOAT_TYPES); + return res; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template static Nd4jLong nonMaxSuppressionGeneric_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, - double overlapThreshold, double scoreThreshold, NDArray* output) { + float overlapThreshold, float scoreThreshold, NDArray* output, SimiliratyFunc f) { -// const int outputSize = maxSize->e(0); auto numBoxes = boxes->sizeAt(0); - //std::vector scoresData(numBoxes); T* scoresData = scores->dataBuffer()->primaryAsT(); - //std::copy_n(scores->getDataBuffer()->primaryAsT(), numBoxes, scoresData.begin()); // Data structure for a selection candidate in NMS. struct Candidate { @@ -113,9 +157,10 @@ namespace helpers { return ((bsI._score == bsJ._score) && (bsI._boxIndex > bsJ._boxIndex)) || (bsI._score < bsJ._score); }; + std::priority_queue, decltype(cmp)> candidatePriorityQueue(cmp); for (auto i = 0; i < scores->lengthOf(); ++i) { - if (scoresData[i] > scoreThreshold) { + if ((float)scoresData[i] > (float)scoreThreshold) { candidatePriorityQueue.emplace(Candidate({i, scoresData[i], 0})); } } @@ -139,17 +184,18 @@ namespace helpers { // following loop. bool shouldHardSuppress = false; for (int j = static_cast(selected.size()) - 1; j >= nextCandidate._suppressBeginIndex; --j) { - similarity = boxes->t(nextCandidate._boxIndex, selected[j]); + auto similarityA = f(*boxes, nextCandidate._boxIndex, selected[j]); //boxes->t(nextCandidate._boxIndex, selected[j]); + similarity = similarityA.template t(0); nextCandidate._score *= T(similarity <= overlapThreshold?1.0:0.); //suppressWeightFunc(similarity); // First decide whether to perform hard suppression - if (similarity >= static_cast(overlapThreshold)) { + if ((float)similarity >= static_cast(overlapThreshold)) { shouldHardSuppress = true; break; } // If next_candidate survives hard suppression, apply soft suppression - if (nextCandidate._score <= scoreThreshold) break; + if ((float)nextCandidate._score <= (float)scoreThreshold) break; } // If `nextCandidate._score` has not dropped below `scoreThreshold` // by this point, then we know that we went through all of the previous @@ -169,7 +215,7 @@ namespace helpers { selected.push_back(nextCandidate._boxIndex); // selected_scores.push_back(nextCandidate._score); } - if (nextCandidate._score > scoreThreshold) { + if ((float)nextCandidate._score > (float)scoreThreshold) { // Soft suppression has occurred and current score is still greater than // score_threshold; add next_candidate back onto priority queue. 
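// For reference, the V3 similarity used by this helper is plain intersection-over-union.
// A minimal standalone sketch of the same computation (hedged: boxes given as
// {y1, x1, y2, x2} with possibly unordered corners; the function name is illustrative):
#include <algorithm>
#include <array>
static float iouSketch(const std::array<float, 4>& a, const std::array<float, 4>& b) {
    // normalize corners first, since coordinates may arrive unordered
    const float yminA = std::min(a[0], a[2]), xminA = std::min(a[1], a[3]);
    const float ymaxA = std::max(a[0], a[2]), xmaxA = std::max(a[1], a[3]);
    const float yminB = std::min(b[0], b[2]), xminB = std::min(b[1], b[3]);
    const float ymaxB = std::max(b[0], b[2]), xmaxB = std::max(b[1], b[3]);
    const float areaA = (ymaxA - yminA) * (xmaxA - xminA);
    const float areaB = (ymaxB - yminB) * (xmaxB - xminB);
    if (areaA <= 0.f || areaB <= 0.f) return 0.f;   // degenerate boxes never overlap
    const float iy = std::min(ymaxA, ymaxB) - std::max(yminA, yminB);
    const float ix = std::min(xmaxA, xmaxB) - std::max(xminA, xminB);
    const float inter = std::max(iy, 0.f) * std::max(ix, 0.f);
    return inter / (areaA + areaB - inter);
}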
candidatePriorityQueue.push(nextCandidate); @@ -188,12 +234,19 @@ namespace helpers { Nd4jLong nonMaxSuppressionGeneric(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output) { - BUILD_DOUBLE_SELECTOR(boxes->dataType(), output == nullptr?DataType::INT32:output->dataType(), return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output), FLOAT_TYPES, INTEGER_TYPES); + BUILD_DOUBLE_SELECTOR(boxes->dataType(), output == nullptr?DataType::INT32:output->dataType(), return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, similiratyOverlaps), FLOAT_TYPES, INTEGER_TYPES); + return 0; + } + + Nd4jLong + nonMaxSuppressionV3(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + double overlapThreshold, double scoreThreshold, NDArray* output) { + BUILD_DOUBLE_SELECTOR(boxes->dataType(), output == nullptr?DataType::INT32:output->dataType(), return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, similiratyV3), FLOAT_TYPES, INTEGER_TYPES); return 0; } BUILD_DOUBLE_TEMPLATE(template Nd4jLong nonMaxSuppressionGeneric_, (nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, - double overlapThreshold, double scoreThreshold, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + float overlapThreshold, float scoreThreshold, NDArray* output, SimiliratyFunc similiratyFunc), FLOAT_TYPES, INTEGER_TYPES); void nonMaxSuppression(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index 292b7e1c6..aa47e3e88 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -100,7 +100,7 @@ namespace helpers { val = nudgedMax; } output[shape::getIndexOffset(b * channels + i, outputShape)] = - (math::nd4j_floor((val - nudgedMin) / scale + T(0.5)) * scale + nudgedMin); + (math::nd4j_floor((val - nudgedMin) / scale + T(0.5f)) * scale + nudgedMin); }; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu index a96db0195..be2a71e8c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu @@ -79,7 +79,49 @@ namespace helpers { return intersectionValue > threshold; } -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template + static __device__ T similirityV3(T* boxes, Nd4jLong* boxesShape, int previousIndex, int nextIndex) { + Nd4jLong previous0[] = {previousIndex, 0}; + Nd4jLong previous1[] = {previousIndex, 1}; + Nd4jLong previous2[] = {previousIndex, 2}; + Nd4jLong previous3[] = {previousIndex, 3}; + Nd4jLong next0[] = {nextIndex, 0}; + Nd4jLong next1[] = {nextIndex, 1}; + Nd4jLong next2[] = {nextIndex, 2}; + Nd4jLong next3[] = {nextIndex, 3}; + + // we have rectangle with given max values. 
Compute vexes of rectangle first + + T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + + // compute areas for comparation + T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); + T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); + + // of course, areas should be positive + if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false; + + // compute intersection of rectangles + T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext); + T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext); + T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext); + T maxIntersectionX = nd4j::math::nd4j_min(maxXPrev, maxXNext); + T intersectionArea = + nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * + nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); + T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea); + // final check + return intersectionValue; + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // shouldSelectKernel - compute status for all selected rectangles (boxes) // // we compute boolean flag as shared uint32 and return it on final only for the first thread @@ -139,7 +181,7 @@ namespace helpers { } } - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // nonMaxSuppressionV2 algorithm - given from TF NonMaxSuppressionV2 implementation // template @@ -200,24 +242,33 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template - static __device__ bool checkOverlapBoxes(T* boxes, Nd4jLong* shape, T* scores, I* indices, I* selectedIndices, I* startIndices, I selectedSize, I nextCandidateIndex, T overlapThreshold, T scoreThreshold) { + static __device__ bool checkOverlapBoxes(T* boxes, Nd4jLong* shape, T* scores, I* indices, I* selectedIndices, I* startIndices, I selectedSize, I nextCandidateIndex, T overlapThreshold, T scoreThreshold, bool simple) { bool shouldHardSuppress = false; T& nextCandidateScore = scores[nextCandidateIndex]; I selectedIndex = indices[nextCandidateIndex]; I finish = startIndices[nextCandidateIndex]; for (int j = selectedSize; j > finish; --j) { - Nd4jLong xPos[] = {selectedIndex, selectedIndices[j - 1]}; - auto xShift = shape::getOffset(shape, xPos, 0); 
- nextCandidateScore *= (boxes[xShift] <= static_cast(overlapThreshold)?T(1.):T(0.));// + T boxVal; + if (simple) { + Nd4jLong xPos[] = {selectedIndex, selectedIndices[j - 1]}; + auto xShift = shape::getOffset(shape, xPos, 0); + boxVal = boxes[xShift]; + } + else { + boxVal = similirityV3(boxes, shape, selectedIndex, selectedIndices[j - 1]); + } + if (boxVal > static_cast(overlapThreshold)) + nextCandidateScore = static_cast(0.f); + // First decide whether to perform hard suppression - if (boxes[xShift] >= overlapThreshold) { + if (boxVal >= overlapThreshold) { shouldHardSuppress = true; break; } // If nextCandidate survives hard suppression, apply soft suppression - if (nextCandidateScore <= scoreThreshold) break; + if (nextCandidateScore <= static_cast(scoreThreshold)) break; } return shouldHardSuppress; @@ -226,7 +277,7 @@ namespace helpers { template static __global__ void suppressNonMaxOverlapKernel(T* boxes, Nd4jLong* boxesShape, T* scoresData, I* indices, I* startIndices, Nd4jLong length, I maxOutputLen, - T overlapThreshold, T scoreThreshold, I* output, Nd4jLong* outputShape, I* outputLength) { + T overlapThreshold, T scoreThreshold, I* output, Nd4jLong* outputShape, I* outputLength, bool simple) { __shared__ I selectedSize; __shared__ I* tempOutput; @@ -253,7 +304,7 @@ namespace helpers { } // check for overlaps bool shouldHardSuppress = checkOverlapBoxes(boxes, boxesShape, scoresData, indices, tempOutput, startIndices, selectedSize, - nextCandidateIndex, overlapThreshold, scoreThreshold);//false; + nextCandidateIndex, overlapThreshold, scoreThreshold, simple);//false; T nextCandidateScore = scoresData[nextCandidateIndex]; startIndices[nextCandidateIndex] = selectedSize; @@ -285,7 +336,7 @@ namespace helpers { template static Nd4jLong nonMaxSuppressionGeneric_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, - double overlapThreshold, double scoreThreshold, NDArray* output) { + double overlapThreshold, double scoreThreshold, NDArray* output, bool simple) { auto stream = context->getCudaStream(); if (output) NDArray::prepareSpecialUse({output}, {boxes, scores}); @@ -315,16 +366,16 @@ namespace helpers { Nd4jLong res = 0; if (output) { // this part used when output shape already calculated to fill up values on output DataBuffer selectedSizeBuf(&selectedSize, sizeof(I), DataTypeUtils::fromT()); - suppressNonMaxOverlapKernel <<<1, 1, 1024, *stream >>> (boxes->dataBuffer()->specialAsT(), + suppressNonMaxOverlapKernel<<<1, 1, 1024, *stream >>> (boxes->dataBuffer()->specialAsT(), boxes->specialShapeInfo(), scoresData, indexBuf, startIndices, scores->lengthOf(), (I) outputSize, T(overlapThreshold), T(scoreThreshold), output->dataBuffer()->specialAsT(), output->specialShapeInfo(), - selectedSizeBuf.specialAsT()); + selectedSizeBuf.specialAsT(), simple); } else { // this case used on calculation of output shape. Output and output shape shoulde be nullptr. 
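// The helper is deliberately invoked twice: once with output == nullptr from the
// shape function to obtain the selected count, then again to fill the real output.
// A hedged sketch of that calling pattern (variable names illustrative; the
// signature is the one declared in image_suppression.h in this patch):
Nd4jLong count = helpers::nonMaxSuppressionV3(ctx, &boxes, &scores,
        maxOutputSize, overlapThreshold, scoreThreshold, /*output=*/nullptr);
auto outShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(count, DataType::INT32);
// ... allocate `output` as an INT32 vector from outShape, then fill it:
helpers::nonMaxSuppressionV3(ctx, &boxes, &scores,
        maxOutputSize, overlapThreshold, scoreThreshold, &output);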
DataBuffer selectedSizeBuf(&selectedSize, sizeof(I), DataTypeUtils::fromT()); - suppressNonMaxOverlapKernel <<<1, 1, 1024, *stream >>> (boxes->dataBuffer()->specialAsT(), + suppressNonMaxOverlapKernel<<<1, 1, 1024, *stream >>> (boxes->dataBuffer()->specialAsT(), boxes->specialShapeInfo(), scoresData, indexBuf, startIndices, scores->lengthOf(), (I)outputSize, - T(overlapThreshold), T(scoreThreshold), (I*)nullptr, (Nd4jLong*) nullptr, selectedSizeBuf.specialAsT()); + T(overlapThreshold), T(scoreThreshold), (I*)nullptr, (Nd4jLong*) nullptr, selectedSizeBuf.specialAsT(), simple); selectedSizeBuf.syncToPrimary(context, true); res = *selectedSizeBuf.primaryAsT(); } @@ -344,7 +395,16 @@ namespace helpers { Nd4jLong nonMaxSuppressionGeneric(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output ? output->dataType():DataType::INT32, return nonMaxSuppressionGeneric_, - (context, boxes, scales, maxSize, threshold, scoreThreshold, output), + (context, boxes, scales, maxSize, threshold, scoreThreshold, output, true), + FLOAT_TYPES, INDEXING_TYPES); + return boxes->sizeAt(0); + } + + Nd4jLong + nonMaxSuppressionV3(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + double overlapThreshold, double scoreThreshold, NDArray* output) { + BUILD_DOUBLE_SELECTOR(boxes->dataType(), output ? output->dataType():DataType::INT32, return nonMaxSuppressionGeneric_, + (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, false), FLOAT_TYPES, INDEXING_TYPES); return boxes->sizeAt(0); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index ec4fd2a97..42acf9c09 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -825,37 +825,30 @@ namespace helpers { NDArray::prepareSpecialUse({output}, {input}); auto n2 = input->sizeAt(-1) * input->sizeAt(-2); auto stream = context->getCudaStream(); - std::unique_ptr tempOutput(input->dup()); -// auto inputs = tempOutput->allTensorsAlongDimension({input->rankOf() - 2, input->rankOf() - 1}); -// for (Nd4jLong e = 0; e < packX.numberOfTads(); e++) { -// auto subArray = inputs->at(e); -// cholesky(context, subArray, subArray, true); -// } -// delete inputs; - cholesky(context, input, tempOutput.get(), false); - tempOutput->syncToHost(); - tempOutput->printIndexedBuffer("Cholesky res!!!"); - auto outputBuf = reinterpret_cast(output->specialBuffer()); // + e * n2; // + e * n2; - auto inputBuf = reinterpret_cast(tempOutput->specialBuffer()); - output->assign(0); - output->syncToDevice(); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput->getShapeInfo(), - {input->rankOf() - 2, - input->rankOf() - 1}); - logDetKernel << < packX.numberOfTads(), n2, 128, *stream >> > - (inputBuf, tempOutput->specialShapeInfo(), packX.numberOfTads(), packX.specialShapeInfo(), packX.specialOffsets(), outputBuf, output->specialShapeInfo()); -// } + NDArray tempOutput(*input); + + cholesky(context, input, &tempOutput, false); + + auto outputBuf = output->dataBuffer()->specialAsT(); //reinterpret_cast(output->specialBuffer()); // + e * n2; // + e * n2; + auto inputBuf = tempOutput.dataBuffer()->specialAsT(); //reinterpret_cast(tempOutput->specialBuffer()); + output->nullify(); + auto packX = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput.getShapeInfo(), + {tempOutput.rankOf() - 2, + tempOutput.rankOf() - 1}); + logDetKernel <<< 128, 512, 256, *stream >>>(inputBuf, tempOutput.specialShapeInfo(), + packX.numberOfTads(), packX.specialShapeInfo(), + packX.specialOffsets(), outputBuf, output->specialShapeInfo()); + output->tickWriteDevice(); NDArray::registerSpecialUse({output}, {input}); - //delete tempOutput; return Status::OK(); } int logdetFunctor(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { - BUILD_SINGLE_SELECTOR(output->dataType(), logdetFunctor_, (context, input, output), FLOAT_NATIVE); + BUILD_SINGLE_SELECTOR(output->dataType(), return logdetFunctor_, (context, input, output), FLOAT_NATIVE); } - BUILD_SINGLE_TEMPLATE(template int logdetFunctor_, - (nd4j::LaunchContext * context, NDArray * input, NDArray * output), FLOAT_NATIVE); +// BUILD_SINGLE_TEMPLATE(template int logdetFunctor_, +// (nd4j::LaunchContext * context, NDArray * input, NDArray * output), FLOAT_NATIVE); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu index 7814defe1..613aa548b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu @@ -35,127 +35,84 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static __global__ void segmentProdLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; + static __global__ void segmentProdLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, + Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { + + __shared__ Nd4jLong xLen, zLen; __shared__ T* x; __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; x = reinterpret_cast(input); z = reinterpret_cast(output); - extern __shared__ unsigned char shmem[]; - val = reinterpret_cast(shmem); xLen = shape::length(inputShape); zLen = shape::length(outputShape); + } + __syncthreads(); - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape); - start = starts[segment]; - finish = start + lengths[segment]; - //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; - val[segment] = z[zIndex]; + for(auto segment = blockIdx.x; segment < numOfClasses; segment += gridDim.x) { + auto zIndex = shape::getIndexOffset(segment, outputShape); + auto start = starts[segment]; + auto finish = start + lengths[segment]; + if (lengths[segment] == 0) { + continue; + } + for (auto e = start + threadIdx.x; e < finish; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape); + nd4j::math::atomics::nd4j_atomicMul(&z[segment], x[xIndex]); } - } - __syncthreads(); -// auto tid = threadIdx.x + blockIdx.x * blockDim.x; -// auto step = blockDim.x * gridDim.x; - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape); - nd4j::math::atomics::nd4j_atomicMul(&val[segment], x[xIndex]); - } - __syncthreads(); - - if (threadIdx.x == 0) { - z[zIndex] = val[segment]; - } - __syncthreads(); } // 
-------------------------------------------------------------------------------------------------------------- // template - static __global__ void unsortedSegmentProdLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; + static __global__ void unsortedSegmentProdLinearKernel(T* input, Nd4jLong* inputShape, I* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, T* output, Nd4jLong* outputShape) { + __shared__ Nd4jLong xLen, zLen; if (threadIdx.x == 0) { -// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); xLen = shape::length(inputShape); zLen = shape::length(outputShape); - -// if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; - else - z[zIndex] = 0; //DataTypeUtils::max(); -// val[segment] = z[zIndex]; -// } - } __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape); - auto yIndex = shape::getIndexOffset(e, indicesShape); - if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); - } + auto start = threadIdx.x + blockIdx.x * blockDim.x; + auto step = blockDim.x * gridDim.x; + for (auto idx = start; idx < xLen; idx += step) { + auto xIndex = shape::getIndexOffset(idx, inputShape); + auto yIndex = shape::getIndexOffset(idx, indicesShape); + auto segment = indices[yIndex]; + auto zIndex = shape::getIndexOffset(segment, outputShape); + if (lengths[segment] == 0) { + continue; } + nd4j::math::atomics::nd4j_atomicMul(&output[zIndex], input[xIndex]); + } } // -------------------------------------------------------------------------------------------------------------- // // SegmentProd kernel template - static __global__ void segmentProdTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; + static __global__ void segmentProdTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, + Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, + Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + + __shared__ Nd4jLong len, total; if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; total = shape::sizeAt(inputShape, 0); - + len = shape::length(inputTads); } __syncthreads(); - auto idx = 
blockIdx.x; - if (blockIdx.x <= total) { + for (auto idx = blockIdx.x; idx < total; idx += gridDim.x) { auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads); - auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads); - auto zIndex = shape::getIndexOffset(e, outputTads); - if (lengths[segment] > 0) - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); - } + auto segment = indices[idx]; // / threadsPerSegment; + auto z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + auto start = starts[segment]; + auto finish = start + lengths[segment]; + if (lengths[segment] == 0) continue; + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); + nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } } } @@ -177,7 +134,7 @@ namespace helpers { int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); if (input->isVector()) { - segmentProdLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); + segmentProdLinearKernel<<<128, 256, 128, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -187,7 +144,7 @@ namespace helpers { Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentProdTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + segmentProdTadKernel<<<128, 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); } } @@ -214,12 +171,15 @@ namespace helpers { fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + output->assign(1); if (input->isVector()) { - unsortedSegmentProdLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + unsortedSegmentProdLinearKernel<<<128, 256, 256, *stream>>>( + input->dataBuffer()->specialAsT(), input->specialShapeInfo(), + indices->dataBuffer()->specialAsT(), indices->specialShapeInfo(), begins, lengths, numOfClasses, + output->dataBuffer()->specialAsT(), output->specialShapeInfo()); } else { - output->assign(1); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); auto packX = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); @@ -228,7 +188,7 @@ namespace helpers { Nd4jLong* outputTads = packZ.specialShapeInfo(); Nd4jLong* outputTadOffsets = packZ.specialOffsets(); dims.x = input->sizeAt(0); - segmentProdTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + segmentProdTadKernel<<<128, 256, 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu index f4237ac44..4daa0f4ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu @@ -32,82 +32,52 @@ namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static __global__ void unsortedSegmentSqrtNLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; + static __global__ void unsortedSegmentSqrtNLinearKernel(T* input, Nd4jLong* inputShape, I* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, T* output, Nd4jLong* outputShape) { + __shared__ Nd4jLong xLen, zLen; if (threadIdx.x == 0) { -// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); xLen = shape::length(inputShape); zLen = shape::length(outputShape); - -// if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)] / nd4j::math::nd4j_sqrt(lengths[segment]); - else - z[zIndex] = 0; //DataTypeUtils::max(); -// val[segment] = z[zIndex]; -// } - } __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape); - auto yIndex = shape::getIndexOffset(e, indicesShape); - if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); - } - } + + auto start = threadIdx.x + blockIdx.x * blockDim.x; + auto step = blockDim.x * gridDim.x; + + for (auto idx = start; idx < xLen; idx += step) { + auto yIndex = shape::getIndexOffset(idx, indicesShape); + auto segment = indices[yIndex]; + auto zIndex = shape::getIndexOffset(segment, outputShape); + if (lengths[segment] == 0) continue; + auto 
xIndex = shape::getIndexOffset(idx, inputShape); + + nd4j::math::atomics::nd4j_atomicAdd(&output[zIndex], input[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); + } } // -------------------------------------------------------------------------------------------------------------- // // SegmentSqrtN kernel template - static __global__ void segmentSqrtNTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; + static __global__ void segmentSqrtNTadKernel(T* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + + __shared__ Nd4jLong len, total; if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; total = shape::sizeAt(inputShape, 0); - + len = shape::length(inputTads); } __syncthreads(); - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads); - auto zIndex = shape::getIndexOffset(e, outputTads); - z[zIndex] = x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment]); - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads); - auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); - } + for (auto idx = blockIdx.x; idx < total; idx += gridDim.x) { + auto segment = indices[idx]; + auto x = inputBuf + inputTadOffsets[idx]; + auto z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + auto start = starts[segment]; + auto finish = start + lengths[segment]; + + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); } } } @@ -122,17 +92,21 @@ namespace helpers { // classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); +// dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); + dim3 dims(128, 256, 256); // int* classesBuf = reinterpret_cast(classes.specialBuffer()); fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - + output->nullify(); if (input->isVector()) { - unsortedSegmentSqrtNLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + 
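+ // each thread now grid-strides over the flat input, looks up its segment id
+ // from `indices`, and accumulates x / sqrt(segmentLength) into the
+ // zero-initialized output (nullify() above) via atomicAdd, so a single
+ // launch covers all segments regardless of numOfClasses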
unsortedSegmentSqrtNLinearKernel<<>>( + input->dataBuffer()->specialAsT(), input->specialShapeInfo(), + indices->dataBuffer()->specialAsT(), indices->specialShapeInfo(), begins, lengths, numOfClasses, + output->dataBuffer()->specialAsT(), output->specialShapeInfo()); } else { - output->assign(0); + output->nullify(); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); @@ -141,7 +115,9 @@ namespace helpers { Nd4jLong* outputTads = packZ.specialShapeInfo(); Nd4jLong* outputTadOffsets = packZ.specialOffsets(); dims.x = input->sizeAt(0); - segmentSqrtNTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + segmentSqrtNTadKernel<<>>( + input->dataBuffer()->specialAsT(), input->specialShapeInfo(), inputTads, inputTadOffsets, indices->dataBuffer()->specialAsT(), + begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); } } // -------------------------------------------------------------------------------------------------------------- // diff --git a/libnd4j/include/ops/declarable/helpers/image_suppression.h b/libnd4j/include/ops/declarable/helpers/image_suppression.h index 85224e0f5..4013b72b3 100644 --- a/libnd4j/include/ops/declarable/helpers/image_suppression.h +++ b/libnd4j/include/ops/declarable/helpers/image_suppression.h @@ -28,6 +28,8 @@ namespace helpers { void nonMaxSuppression(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output); + Nd4jLong nonMaxSuppressionV3(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, + double overlapThreshold, double scoreThreshold, NDArray* output); Nd4jLong nonMaxSuppressionGeneric(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output); diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index 55cd4033a..7aa4dbbe6 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -1446,35 +1446,63 @@ inline __device__ unsigned char nd4j_atomicMul(unsigned char* add return (uint8_t)old; } -template <> -inline __device__ int16_t nd4j_atomicMul(int16_t* address, int16_t val) { +template +static inline __device__ T internal_16bit_atomicMul(T* address, int16_t val) { size_t shift = ((size_t)address & 2); int *base_address = (int *)((char*)address - shift); - int old = val, assumed; - //printf("%u %x", *base_address); - do { - assumed = old; - old = atomicCAS(base_address, assumed, (old * val)); - } while (assumed != old); + union I16PAIR { + struct { + T H; + T L; + } B; + int W; - return (int16_t)old; + __host__ __device__ + I16PAIR() {}; + + __host__ __device__ + ~I16PAIR() {}; + }; + + I16PAIR pairNew, pairOld, pairAssumed; + + pairOld.W = (int) val; + if (reinterpret_cast(address) == base_address) { + do { + + pairNew.B.L = pairOld.B.L; + pairNew.B.H = pairOld.B.H * val; + pairAssumed.W = pairOld.W; + + pairOld.W = atomicCAS(base_address, pairAssumed.W, pairNew.W); + } while (pairAssumed.W != pairOld.W); + + return (T) pairOld.B.H; 
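+ // aligned case above: the target 16 bits occupy the first half of the
+ // 32-bit word; the misaligned case below rewrites the second half instead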
+ } else { + do { + + pairNew.B.H = pairOld.B.H; + pairNew.B.L = pairOld.B.L * val; + pairAssumed.W = pairOld.W; + pairOld.W = atomicCAS(base_address, pairAssumed.W, pairNew.W); + + } while (pairAssumed.W != pairOld.W); + + return (T) pairOld.B.L; + } + +} + + +template <> +inline __device__ int16_t nd4j_atomicMul(int16_t* address, int16_t val) { + return internal_16bit_atomicMul(address, val); } template <> inline __device__ uint16_t nd4j_atomicMul(uint16_t* address, uint16_t val) { - size_t shift = ((size_t)address & 2); - unsigned int *base_address = (unsigned int *)((char*)address - shift); - unsigned int old = val, assumed; - //printf("%u %x", *base_address); - do { - - assumed = old; - old = atomicCAS(base_address, assumed, (old * val)); - } while (assumed != old); - - return (uint16_t)old; - + return internal_16bit_atomicMul(address, val); } template <> @@ -1547,105 +1575,27 @@ inline __device__ Nd4jLong nd4j_atomicMul(Nd4jLong* address, Nd4jLong template <> inline __device__ bfloat16 nd4j_atomicMul(bfloat16* address, bfloat16 val) { - auto address_as_ull = (int*) address; - - long addr = (long)(address); - bool misaligned = addr & 0x3; - - if (misaligned) - address_as_ull = (int *) (address - 1); - - BPAIR old, assumed, fresh; - - old.W = *address_as_ull; - do { - - if (!misaligned) { - bfloat16 res = old.B.H * val; - fresh.B.H = res; - fresh.B.L = old.B.L; - } else { - bfloat16 res = old.B.L * val; - fresh.B.L = res; - fresh.B.H = old.B.H; - } - - assumed.W = old.W; - old.W = atomicCAS(address_as_ull, assumed.W, fresh.W); - } while (assumed.W != old.W); - - if (!misaligned) return old.B.H; - else return old.B.L; + return internal_16bit_atomicMul(address, val); } template <> inline __device__ float16 nd4j_atomicMul(float16* address, float16 val) { - auto address_as_ull = (int*) address; - - long addr = (long)(address); - bool misaligned = addr & 0x3; - - if (misaligned) - address_as_ull = (int *) (address - 1); - - BPAIR old, assumed, fresh; - - old.W = *address_as_ull; - do { - - if (!misaligned) { - bfloat16 res = old.B.H * val; - fresh.B.H = res; - fresh.B.L = old.B.L; - } else { - bfloat16 res = old.B.L * val; - fresh.B.L = res; - fresh.B.H = old.B.H; - } - - assumed.W = old.W; - old.W = atomicCAS(address_as_ull, assumed.W, fresh.W); - } while (assumed.W != old.W); - - if (!misaligned) return old.B.H; - else return old.B.L; + return internal_16bit_atomicMul(address, val); } template <> inline __device__ float nd4j_atomicDiv(float* address, float val) { - int* address_as_ull = - (int*)address; - int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __float_as_int(__int_as_float(assumed) / val )); - } while (assumed != old); - return __int_as_float(old); + return nd4j_atomicMul(address, (float) 1.f / val); } template <> inline __device__ float16 nd4j_atomicDiv(float16* address, float16 val) { - int* address_as_ull = - (int*)address; - int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __float_as_int(val * - __float_as_int(assumed))); - } while (assumed != old); - return __int_as_float(old); + return nd4j_atomicMul(address, (float16) 1.f / val); } + template <> inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 val) { - int* address_as_ull = - (int*)address; - int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __float_as_int(val * - __float_as_int(assumed))); - } while (assumed != old); - return 
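// The float16/bfloat16 atomicDiv bodies being removed here were broken: they
// fed the raw CAS word through __float_as_int(assumed), i.e. treated packed
// half-precision bits as if they were a float, and then multiplied by val
// instead of dividing. The one-line replacements reduce division to the
// repaired multiply path via the identity atomicDiv(a, v) == atomicMul(a, 1/v);
// the reciprocal is computed once, non-atomically, and only the multiply is
// retried under CAS. The new AtomicTests.cu below exercises exactly this path:
// 25 accumulators seeded with 2 each receive four atomic multiplications by 2,
// so every slot must end up at 2 * 2^4 = 32 for FLOAT32, DOUBLE and INT16 alike.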
__int_as_float(old); + return nd4j_atomicMul(address, (bfloat16) 1.f / val); } } #endif diff --git a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu new file mode 100644 index 000000000..0ede6398c --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu @@ -0,0 +1,86 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include + + +using namespace nd4j; + + +class AtomicTests : public testing::Test { +public: + AtomicTests() { + // + } +}; + +template +static _CUDA_G void multiplyKernel(void *vbuffer, uint64_t length, void *vresult) { + auto buffer = reinterpret_cast(vbuffer); + auto result = reinterpret_cast(vresult); + + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (auto e = tid; e < length; e += gridDim.x * blockDim.x) { + auto rem = e % 4; + auto i = (e - rem) / 4; + + nd4j::math::atomics::nd4j_atomicMul(&result[i], buffer[e]); + } +} + +template +static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) { + multiplyKernel<<<256, 256, 1024, *nd4j::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); + auto err = cudaStreamSynchronize(*nd4j::LaunchContext::defaultContext()->getCudaStream()); + if (err != 0) + nd4j::cuda_exception::build("multiply failed", err); +} + +static void multiplyHost(NDArray &input, NDArray &output) { + BUILD_SINGLE_SELECTOR(input.dataType(), multiplyLauncher, (input.specialBuffer(), input.lengthOf(), output.specialBuffer()), NUMERIC_TYPES); +} + +TEST_F(AtomicTests, test_multiply) { + std::vector dtypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::INT16}; + + for (auto t:dtypes) { + nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); + NDArray input('c', {4, 25}, t); + NDArray output('c', {input.lengthOf() / 4}, t); + NDArray exp = output.ulike(); + + input.assign(2); + output.assign(2); + exp.assign(32); + + multiplyHost(input, output); + ASSERT_EQ(exp, output); + } + + + +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 30ceeea98..8a889ea44 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -2330,6 +2330,76 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_5) { delete results; } +TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_6) { + + NDArray boxes = NDArrayFactory::create('c', {3, 4}, {0.8115f, 0.4121f, 0.0771f, 0.4863f, + 0.7412f, 0.7607f, 0.1543f, 0.5479f, + 0.8223f, 0.2246f, 0.0049f, 0.6465f}); + NDArray scales = NDArrayFactory::create('c', {3}, {0.0029f, 0.8135f, 
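// (non_max_suppression_v3 follows TF's NonMaxSuppressionV3: candidates are
// visited in descending score order, a candidate is dropped when its IoU with
// an already-selected box exceeds iou_threshold or its score falls below
// score_threshold, and at most max_output_size indices are returned. In this
// test box 1 (score 0.8135) is selected first, box 2 (0.4873) survives the
// overlap check, and box 0 (0.0029) is cut off by maxSize = 2, so the expected
// output is the index vector {1, 2}. Test 7 further down pins the
// max_output_size = 0 corner case, which must produce an empty index array
// rather than an error.)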
0.4873f}); //3, 0, 1, 2, 4, 5
+    NDArray expected = NDArrayFactory::create('c', {2}, {1,2});
+    NDArray maxSize = NDArrayFactory::create(2);
+    NDArray threshold = NDArrayFactory::create(0.5f);
+    NDArray scoreThreshold = NDArrayFactory::create(-DataTypeUtils::infOrMax());
+    nd4j::ops::non_max_suppression_v3 op;
+    auto results = op.execute({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {});
+
+    ASSERT_EQ(Status::OK(), results->status());
+
+    NDArray* result = results->at(0);
+//    result->printBuffer("NonMaxSuppression Output6");
+//    result->printShapeInfo("Output6 shape is");
+    ASSERT_TRUE(expected.isSameShapeStrict(result));
+    ASSERT_TRUE(expected.equalsTo(result));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_06) {
+
+    NDArray boxes = NDArrayFactory::create('c', {3, 4}, {0.8115f, 0.4121f, 0.0771f, 0.4863f,
+                                                         0.7412f, 0.7607f, 0.1543f, 0.5479f,
+                                                         0.8223f, 0.2246f, 0.0049f, 0.6465f});
+    NDArray scales = NDArrayFactory::create('c', {3}, {0.0029f, 0.8135f, 0.4873f}); //3, 0, 1, 2, 4, 5
+    NDArray expected = NDArrayFactory::create('c', {2}, {1,2});
+    NDArray maxSize = NDArrayFactory::create(2);
+    NDArray threshold = NDArrayFactory::create(0.5f);
+    NDArray scoreThreshold = NDArrayFactory::create(-DataTypeUtils::infOrMax());
+    nd4j::ops::non_max_suppression_v3 op;
+    auto results = op.execute({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {});
+
+    ASSERT_EQ(Status::OK(), results->status());
+
+    NDArray* result = results->at(0);
+//    result->printBuffer("NonMaxSuppression Output06");
+//    result->printShapeInfo("Output06 shape is");
+    ASSERT_TRUE(expected.isSameShapeStrict(result));
+    ASSERT_TRUE(expected.equalsTo(result));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_7) {
+
+    NDArray boxes = NDArrayFactory::create('c', {3, 4}, {0.7788f, 0.8012f, 0.7244f, 0.2329f,
+                                                         0.7271f, 0.1804f, 0.5056f, 0.8929f,
+                                                         0.5461f, 0.9234f, 0.0856f, 0.7938f});
+    NDArray scales = NDArrayFactory::create('c', {3}, {0.7717f, 0.9281f, 0.9846f}); //3, 0, 1, 2, 4, 5
+    NDArray maxSize = NDArrayFactory::create(0);
+    NDArray threshold = NDArrayFactory::create(0.5f);
+    NDArray scoreThreshold = NDArrayFactory::create(0.5f);
+    nd4j::ops::non_max_suppression_v3 op;
+    auto results = op.execute({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {});
+
+    ASSERT_EQ(Status::OK(), results->status());
+
+    NDArray* result = results->at(0);
+//    result->printBuffer("NonMaxSuppression Output7");
+//    result->printShapeInfo("Output7 shape is");
+    ASSERT_TRUE(result->isEmpty());
+
+    delete results;
+}
+
 ////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressingOverlap_1) {
 
@@ -2720,22 +2790,22 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) {
 
     NDArray x = NDArrayFactory::create('c', {2,4,5,3});
    NDArray exp = NDArrayFactory::create('c', {2,4,5,3},{
-            1.0588236, 1.9607843, 3.019608, 4.0588236, 5.098039, 6.039216, 7.0588236, 8.039216, 9.058824,
-            10.058824, 10.980392, 12.078432, 13.058824, 13.921569, 15.09804, 16.058825, 17.058825, 18.117647,
-            19.058825, 20., 21.137257, 22.058825, 22.941177, 23.882355, 25.058825, 26.078432, 26.901962,
-            28.058825, 29.019608, 29.92157, 31.058825, 31.960785, 32.941177, 34.058823, 35.09804, 35.960785,
-            37.058823, 38.039215, 38.980392, 40.058823, 40.980392, 42.000004, 43.058826, 43.92157, 45.01961,
-            45., 47.058823, 48.03922, 45., 50., 51.058826, 45., 50., 54.078434,
-            45., 50., 57.09804, 45., 50., 60.11765, 45., 50., 62.862747,
-            45., 50.,
65.882355, 45., 50., 68.90196, 45., 50., 70., - 45., 50., 70., 45., 50., 70., 45., 50., 70., - 45., 50., 70., 45., 50., 70., 45., 50., 70., - 45., 50., 70., 45., 50., 70., 45., 50., 70., - 45., 50., 70., 45., 50., 70., 45., 50., 70., - 45., 50., 70., 45., 50., 70., 45., 50., 70., - 45., 50., 70.}); - NDArray min = NDArrayFactory::create({20., 20., 20.}); - NDArray max = NDArrayFactory::create({65., 70., 90.}); + 1.0588236f, 1.9607843f, 3.019608f, 4.0588236f, 5.098039f, 6.039216f, 7.0588236f, 8.039216f, 9.058824f, + 10.058824f, 10.980392f, 12.078432f, 13.058824f, 13.921569f, 15.09804f, 16.058825f, 17.058825f, 18.117647f, + 19.058825f, 20.f, 21.137257f, 22.058825f, 22.941177f, 23.882355f, 25.058825f, 26.078432f, 26.901962f, + 28.058825f, 29.019608f, 29.92157f, 31.058825f, 31.960785f, 32.941177f, 34.058823f, 35.09804f, 35.960785f, + 37.058823f, 38.039215f, 38.980392f, 40.058823f, 40.980392f, 42.000004f, 43.058826f, 43.92157f, 45.01961f, + 45.f, 47.058823f, 48.03922f, 45.f, 50.f, 51.058826f, 45.f, 50.f, 54.078434f, + 45.f, 50.f, 57.09804f, 45.f, 50.f, 60.11765f, 45.f, 50.f, 62.862747f, + 45.f, 50.f, 65.882355f, 45.f, 50.f, 68.90196f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, 45.f, 50.f, 70.f, + 45.f, 50.f, 70.f}); + NDArray min = NDArrayFactory::create({20.f, 20.f, 20.f}); + NDArray max = NDArrayFactory::create({65.f, 70.f, 90.f}); x.linspace(1.); nd4j::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.execute({&x, &min, &max}, {}, {}); @@ -2756,36 +2826,36 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_5) { NDArray x = NDArrayFactory::create('c', {2, 3, 5, 4}); NDArray exp = NDArrayFactory::create('c', {2, 3, 5, 4},{ - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -19.92157 , -18.980392 , -18.039217 , -16.941177 , - -16. , -15.058824 , -13.960785 , -13.0196085 , - -11.92157 , -10.980392 , -10.039217 , -8.941177 , - -8.000001 , -7.0588236 , -5.960785 , -5.0196085 , - -3.9215698 , -2.9803925 , -2.039217 , -0.94117737, - 0. , 0.94117737, 2.039215 , 2.9803925 , - 4.07843 , 5.0196075 , 5.960783 , 7.0588226 , - 8. , 8.941177 , 10.039215 , 10.980392 , - 12.07843 , 13.019608 , 13.960783 , 15.058823 , - 16. 
, 16.941177 , 18.039217 , 18.980392 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 , - 20.07843 , 21.019608 , 21.960783 , 23.058823 + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -19.92157f, -18.980392f, -18.039217f, -16.941177f, + -16.f, -15.058824f, -13.960785f, -13.0196085f, + -11.92157f, -10.980392f, -10.039217f, -8.941177f, + -8.000001f, -7.0588236f, -5.960785f, -5.0196085f, + -3.9215698f, -2.9803925f, -2.039217f, -0.94117737f, + 0.f, 0.94117737f, 2.039215f, 2.9803925f, + 4.07843f, 5.0196075f, 5.960783f, 7.0588226f, + 8.f, 8.941177f, 10.039215f, 10.980392f, + 12.07843f, 13.019608f, 13.960783f, 15.058823f, + 16.f, 16.941177f, 18.039217f, 18.980392f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f, + 20.07843f, 21.019608f, 21.960783f, 23.058823f }); NDArray min = NDArrayFactory::create({-20., -19., -18., -17}); NDArray max = NDArrayFactory::create({20., 21., 22., 23}); @@ -2833,9 +2903,10 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_6) { // 0.0877f, 0.5966f, 0.6600f, 0.3513f, 0.1604f // }); - NDArray exp = NDArrayFactory::create('c', {3,5}, {0.77700233, 0.596913, 0.72314, 0.23104, 0.50982356, - 0.17930824, 0.50528157, 0.86846, 0.34995764, 0.50982356, - 0.08735529, 0.596913, 0.6574, 0.34995764, 0.15974471}); + NDArray exp = NDArrayFactory::create('c', {3,5}, { + 0.77700233f, 0.596913f, 0.72314f, 0.23104f, 0.50982356f, + 0.17930824f, 0.50528157f, 0.86846f, 0.34995764f, 0.50982356f, + 0.08735529f, 0.596913f, 0.6574f, 0.34995764f, 0.15974471f}); NDArray min = NDArrayFactory::create('c', {5}, {-0.2283f, -0.0719f, -0.0154f, -0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create('c', {5}, {0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); // x.linspace(-60.); @@ -2856,45 +2927,74 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_6) { delete results; } -//////////////////////////////////////////////////////////////////// -//TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_7) { -// -// NDArray x = NDArrayFactory::create('c', {100}); -// NDArray exp = NDArrayFactory::create('c', {100}, { -// 0.f, 0.f, 0.f , 0.f , 0.06666667f, 0.06666667f , -// 0.06666667, 0.06666667, 0.06666667, 0.06666667, 0.06666667, 0.13333334 , -// 0.13333334, 0.13333334, 0.13333334, 0.13333334, 0.13333334, 0.20000002 , -// 0.20000002, 
0.20000002, 0.20000002, 0.20000002, 0.20000002, 0.20000002 , -// 0.26666668, 0.26666668, 0.26666668, 0.26666668, 0.26666668, 0.26666668 , -// 0.26666668, 0.33333334, 0.33333334, 0.33333334, 0.33333334, 0.33333334 , -// 0.33333334, 0.40000004, 0.40000004, 0.40000004, 0.40000004, 0.40000004 , -// 0.40000004, 0.40000004, 0.4666667 , 0.4666667 , 0.4666667 , 0.4666667 , -// 0.4666667 , 0.4666667 , 0.4666667 , 0.53333336, 0.53333336, 0.53333336 , -// 0.53333336, 0.53333336, 0.53333336, 0.6 , 0.6 , 0.6 , -// 0.6 , 0.6 , 0.6 , 0.6 , 0.6666667 , 0.6666667 , -// 0.6666667 , 0.6666667 , 0.6666667 , 0.6666667 , 0.6666667 , 0.73333335 , -// 0.73333335, 0.73333335, 0.73333335, 0.73333335, 0.73333335, 0.8000001 , -// 0.8000001 , 0.8000001 , 0.8000001 , 0.8000001 , 0.8000001 , 0.8000001 , -// 0.86666673, 0.86666673, 0.86666673, 0.86666673, 0.86666673, 0.86666673 , -// 0.86666673, 0.9333334 , 0.9333334 , 0.9333334 , 0.9333334 , 0.9333334 , -// 0.9333334 , 1., 1., 1., -// }); -// NDArray min = NDArrayFactory::create('c', {1},{0.0f}); -// NDArray max = NDArrayFactory::create('c', {1}, {1.f}); -// x.linspace(0., 0.01); -// nd4j::ops::fake_quant_with_min_max_vars op; -// auto results = op.execute({&x, &min, &max}, {}, {}); -// -// ASSERT_EQ(ND4J_STATUS_OK, results->status()); -// -// auto result = results->at(0); +////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_7) { + + NDArray x = NDArrayFactory::create('c', {100}); + NDArray exp = NDArrayFactory::create('c', {100}, { + 0.f, 0.01176471f, 0.01960784f, 0.03137255f, 0.03921569f, + 0.0509804f, 0.05882353f, 0.07058824f, 0.07843138f, 0.09019608f, + 0.09803922f, 0.10980393f, 0.12156864f, 0.12941177f, 0.14117648f, + 0.14901961f, 0.16078432f, 0.16862746f, 0.18039216f, 0.18823531f, + 0.20000002f, 0.21176472f, 0.21960786f, 0.23137257f, 0.2392157f, + 0.2509804f, 0.25882354f, 0.27058825f, 0.2784314f, 0.2901961f, + 0.3019608f, 0.30980393f, 0.32156864f, 0.32941177f, 0.34117648f, + 0.34901962f, 0.36078432f, 0.36862746f, 0.3803922f, 0.38823533f, + 0.40000004f, 0.41176474f, 0.41960788f, 0.43137258f, 0.43921572f, + 0.45098042f, 0.45882356f, 0.47058827f, 0.4784314f, 0.4901961f, + 0.49803925f, 0.50980395f, 0.52156866f, 0.5294118f, 0.5411765f, + 0.54901963f, 0.56078434f, 0.5686275f, 0.5803922f, 0.5882353f, + 0.6f, 0.6117647f, 0.61960787f, 0.6313726f, 0.6392157f, + 0.6509804f, 0.65882355f, 0.67058825f, 0.6784314f, 0.6901961f, + 0.69803923f, 0.70980394f, 0.72156864f, 0.7294118f, 0.7411765f, + 0.7490196f, 0.7607844f, 0.7686275f, 0.7803922f, 0.78823537f, + 0.8000001f, 0.8117648f, 0.8196079f, 0.8313726f, 0.83921576f, + 0.85098046f, 0.8588236f, 0.8705883f, 0.87843144f, 0.89019614f, + 0.8980393f, 0.909804f, 0.9215687f, 0.9294118f, 0.94117653f, + 0.9490197f, 0.9607844f, 0.9686275f, 0.9803922f, 0.98823535f + }); + NDArray min = NDArrayFactory::create('c', {1},{0.0f}); + NDArray max = NDArrayFactory::create('c', {1}, {1.f}); + x.linspace(0., 0.01); + nd4j::ops::fake_quant_with_min_max_vars op; + auto results = op.execute({&x, &min, &max}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); // result->printBuffer("Quantized7"); // exp.printBuffer("Expected 7"); -// ASSERT_TRUE(exp.isSameShapeStrict(result)); -// ASSERT_TRUE(exp.equalsTo(result)); -// -// delete results; -//} + ASSERT_TRUE(exp.isSameShapeStrict(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +////////////////////////////////////////////////////////////////// 
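// For reference, the mapping the FakeQuantWithMinMaxVars tests above and below
// pin down is TF-style fake quantization: min/max are first "nudged" so that
// zero falls exactly on the (2^numBits - 1)-step grid, then inputs are clamped
// to the nudged range and snapped to the nearest step. The nudge is also why
// Test_4's saturated entries read 45/50/70 rather than the raw per-channel
// maxima 65/70/90: with min = 20 > 0, forcing a representable zero shifts each
// channel's range to [0, max - min]. What follows is a minimal single-channel
// sketch of that scheme, assuming the default 8 bits and <cmath>/<algorithm>;
// the helper name and layout are illustrative, not the library implementation.
static float fakeQuantRef(float x, float min, float max, int numBits = 8) {
    const float quantMax = static_cast<float>((1 << numBits) - 1);  // 255 for 8 bits
    const float scale = (max - min) / quantMax;
    // nudge the zero point onto the integer grid so that 0.0f is representable
    const float zpFromMin = -min / scale;
    const float nudgedZp = zpFromMin < 0.f ? 0.f
                         : zpFromMin > quantMax ? quantMax
                         : std::round(zpFromMin);
    const float nudgedMin = -nudgedZp * scale;
    const float nudgedMax = (quantMax - nudgedZp) * scale;
    // clamp into the nudged range, snap to the grid, and map back
    const float clamped = std::min(nudgedMax, std::max(nudgedMin, x));
    return std::round((clamped - nudgedMin) / scale) * scale + nudgedMin;
}
//////////////////////////////////////////////////////////////////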
+TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) { + + NDArray x = NDArrayFactory::create('c', {10}); + NDArray exp = NDArrayFactory::create('c', {10}, { + 0.f, 0.09803922f, 0.20000002f, 0.3019608f, 0.40000004f, 0.49803925f, + 0.6f, 0.69803923f, 0.8000001f, 0.8980393f + }); + NDArray min = NDArrayFactory::create('c', {1},{0.0f}); + NDArray max = NDArrayFactory::create('c', {1}, {1.f}); + x.linspace(0., 0.1); + nd4j::ops::fake_quant_with_min_max_vars op; + auto results = op.execute({&x, &min, &max}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); +// x.printBuffer("SourInput8"); +// result->printBuffer("Quantized8"); +// exp.printBuffer("Expected 8"); + ASSERT_TRUE(exp.isSameShapeStrict(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, batchnorm_test1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp index be25f0c62..3ebcbd016 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp @@ -1523,14 +1523,12 @@ TEST_F(DeclarableOpsTests6, LogDet_1) { auto x = NDArrayFactory::create('c', {2, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98, 4,1.2,-1.6,1.2,3.7,-4.3,-1.6,-4.3,9.8}); auto exp = NDArrayFactory::create({ 3.5835189, 4.159008}); - //x.printIndexedBuffer("Input"); nd4j::ops::logdet op; auto result = op.execute({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index e9fe7264e..a79633c03 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -1813,8 +1813,8 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests7, TestSegmentSum_1) { - auto x = NDArrayFactory::create({1.8, 2.5,4., 9., 2.1, 2.4,3.,9., 2.1, 2.1,0.7, 0.1, 3., 4.2, 2.2, 1. }); - auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); + auto x = NDArrayFactory::create({1.8, 2.5, 4., 9., 2.1, 2.4, 3., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1. 
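// (The segment_* family expects idx to be sorted and non-decreasing, and folds
// every run of equal indices into a single output slot: output[k] reduces all
// x[j] with idx[j] == k. TestSegmentProd_05_1 below is the smallest worked
// example: x = {1..8} with idx = {0,0,1,2,2,2,3,3} yields the products
// {1*2, 3, 4*5*6, 7*8} = {2, 3, 120, 56}, exactly its expected vector.)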
}); + auto idx = NDArrayFactory::create({ 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.3, 17.5, 3., 13.2, 11.2}); nd4j::ops::segment_sum op; @@ -2231,6 +2231,27 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); + auto res = result->at(0); +// res->printIndexedBuffer("Segment prod 05"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); + + delete result; +} +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests7, TestSegmentProd_05_1) { + auto x = NDArrayFactory::create({1,2,3,4,5,6,7,8 }); + +// ---------------------------------------------------------------- + + auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); + auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); + + nd4j::ops::segment_prod op; + + auto result = op.execute({&x, &idx}, {}, {}); + ASSERT_EQ(result->status(), Status::OK()); + auto res = result->at(0); +// res->printIndexedBuffer("Segment prod 05_1"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2270,6 +2291,23 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_07) { delete result; } +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests7, TestSegmentProd_08) { + auto x = NDArrayFactory::create({'\x1','\x2','\x3','\x4','\x5','\x6','\x7','\x8', '\x9', '\xA' }); + +// ---------------------------------------------------------------- + + auto idx = NDArrayFactory::create({0,0,2,2,2,2,3,3,3,3}); + auto exp = NDArrayFactory::create({ 2, 1,360, 5040}); + nd4j::ops::segment_prod op; + + auto result = op.execute({&x, &idx}, {}, {}); + ASSERT_EQ(result->status(), Status::OK()); + ASSERT_TRUE(exp.equalsTo(result->at(0))); + + delete result; +} + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_1) { auto x = NDArrayFactory::create({1.8, 2.5,4., 9., 2.1, 2.4,3.,9., 2.1, 2.1,0.7, 0.1, 3., 4.2, 2.2, 1.}); @@ -2341,6 +2379,22 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_12) { delete result; } +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_08) { + auto x = NDArrayFactory::create({'\x1','\x2','\x3','\x4','\x5','\x6','\x7','\x8', '\x9', '\xA' }); + +// ---------------------------------------------------------------- + + auto idx = NDArrayFactory::create({0,0,2,2,2,2,3,3,3,3}); + auto exp = NDArrayFactory::create({ 2, 1,360, 5040}); + nd4j::ops::unsorted_segment_prod op; + + auto result = op.execute({&x, &idx}, {}, {4}); + ASSERT_EQ(result->status(), Status::OK()); + ASSERT_TRUE(exp.equalsTo(result->at(0))); + + delete result; +} //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_3) { @@ -2401,6 +2455,41 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_4) { delete result; } +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_5) { + auto x = NDArrayFactory::create('c', {8, 15}); + +// ---------------------------------------------------------------- + + auto idx = NDArrayFactory::create({3, 1, 2, 1, 2, 3, 2, 1}); + auto exp = NDArrayFactory::create('c', {4, 15}, { + 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., + 78016., 85493., 93312., 101479., 110000., + 118881., 128128., 137747., 
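// (unsorted_segment_prod differs from the sorted variant in two ways this
// expected output makes visible: the segment ids {3, 1, 2, 1, 2, 3, 2, 1} may
// arrive in any order, and the segment count is passed explicitly as the iArg
// {4} in op.execute below. A segment that receives no rows (id 0 here) is
// therefore filled with the multiplicative identity, which is why the first
// output row is all 1s.)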
147744., 158125., + 168896., 180063., 191632., 203609., 216000., + 172081., 182528., 193347., 204544., 216125., + 228096., 240463., 253232., 266409., 280000., + 294011., 308448., 323317., 338624., 354375., + 76., 154., 234., 316., 400., + 486., 574., 664., 756., 850., + 946., 1044., 1144., 1246., 1350.}); + x.linspace(1.); + + nd4j::ops::unsorted_segment_prod op; + + auto result = op.execute({&x, &idx}, {}, {4}); + ASSERT_EQ(result->status(), Status::OK()); + //result->at(0)->printIndexedBuffer("Output"); +// result->at(0)->printShapeInfo("Out Shape"); + //exp.printIndexedBuffer("Expect"); +// exp.printShapeInfo("Exp Shape"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); + + delete result; +} + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProdBP_4) { auto x = NDArrayFactory::create('c', {8}, { diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu index 890021c76..6aafe4ea2 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu @@ -39,7 +39,7 @@ TEST_F(CudaWorkspaceTests, Basic_Tests_1) { ctx.setWorkspace(&workspace); auto array = NDArrayFactory::create('c', {5, 5}, &ctx); - ASSERT_EQ(100, workspace.getCurrentOffset()); + ASSERT_EQ(108, workspace.getCurrentOffset()); ASSERT_EQ(0, workspace.getCurrentSecondaryOffset()); array.e(0); @@ -55,6 +55,6 @@ TEST_F(CudaWorkspaceTests, Basic_Tests_2) { ctx.setWorkspace(&workspace); auto array = NDArrayFactory::create('c', {5, 5}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, &ctx); - ASSERT_EQ(100, workspace.getCurrentOffset()); + ASSERT_EQ(108, workspace.getCurrentOffset()); ASSERT_EQ(0, workspace.getCurrentSecondaryOffset()); } \ No newline at end of file diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java index 8826858e5..b1de641b4 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java @@ -86,6 +86,7 @@ public class ImportClassMapping { org.nd4j.linalg.api.ops.impl.image.CropAndResize.class, org.nd4j.linalg.api.ops.impl.image.ExtractImagePatches.class, org.nd4j.linalg.api.ops.impl.image.NonMaxSuppression.class, + org.nd4j.linalg.api.ops.impl.image.NonMaxSuppressionV3.class, org.nd4j.linalg.api.ops.impl.image.ResizeBilinear.class, org.nd4j.linalg.api.ops.impl.image.ResizeNearestNeighbor.class, org.nd4j.linalg.api.ops.impl.indexaccum.FirstIndex.class, diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppression.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppression.java index 25463030c..d7161cf5f 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppression.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppression.java @@ -53,7 +53,7 @@ public class NonMaxSuppression extends DynamicCustomOp { @Override public String[] tensorflowNames() { - 
return new String[]{"NonMaxSuppression", "NonMaxSuppressionV2", "NonMaxSuppressionV3"}; + return new String[]{"NonMaxSuppression", "NonMaxSuppressionV2"}; } @Override diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppressionV3.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppressionV3.java new file mode 100644 index 000000000..d087287cf --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/image/NonMaxSuppressionV3.java @@ -0,0 +1,79 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.api.ops.impl.image; + +import lombok.NonNull; +import org.nd4j.autodiff.samediff.SDVariable; +import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.imports.NoOpNameFoundException; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.api.ops.Op; + +import java.util.Collections; +import java.util.List; + +/** + * Non max suppression + * + * @author raver119@gmail.com + */ +public class NonMaxSuppressionV3 extends DynamicCustomOp { + + public NonMaxSuppressionV3() {} + + public NonMaxSuppressionV3(SameDiff sameDiff, @NonNull SDVariable boxes, @NonNull SDVariable scores, @NonNull SDVariable maxOutSize, + @NonNull SDVariable iouThreshold, @NonNull SDVariable scoreThreshold) { + super(null, sameDiff, new SDVariable[]{boxes, scores, maxOutSize, iouThreshold, scoreThreshold}, false); + } + + @Override + public String onnxName() { + throw new NoOpNameFoundException("No onnx name found for shape " + opName()); + } + + @Override + public String tensorflowName() { + return "NonMaxSuppressionV3"; + } + + @Override + public String[] tensorflowNames() { + return new String[]{"NonMaxSuppressionV3","NonMaxSuppressionV4"}; + } + + @Override + public String opName() { + return "non_max_suppression_v3"; + } + + @Override + public Op.Type opType() { + return Op.Type.CUSTOM; + } + + @Override + public List doDiff(List i_v) { + return Collections.singletonList(sameDiff.zerosLike(arg())); + } + + @Override + public List calculateOutputDataTypes(List inputDataTypes){ + //Always 1D integer tensor (indices) + return Collections.singletonList(DataType.INT); + } +} diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index ad6807608..85c1f92f4 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -20827,7 +20827,7 @@ public static final 
int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif /** - * image.non_max_suppression op. + * image.non_max_suppression ops. * input: * 0 - boxes - 2D-tensor with shape (num_boxes, 4) by float type * 1 - scales - 1D-tensor with shape (num_boxes) by float type @@ -20859,6 +20859,23 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } // #endif +// #if NOT_EXCLUDED(OP_image_non_max_suppression_v3) + @Namespace("nd4j::ops") public static class non_max_suppression_v3 extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public non_max_suppression_v3(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public non_max_suppression_v3(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public non_max_suppression_v3 position(long position) { + return (non_max_suppression_v3)super.position(position); + } + + public non_max_suppression_v3() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif /* * image.non_max_suppression_overlaps op.