From 3de3cd8277bc6c2d1a3fa23dff511daab44b0b87 Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 13 Feb 2020 20:59:35 +0300 Subject: [PATCH] R119 tests (#238) * one small test Signed-off-by: raver119 * one small test Signed-off-by: raver119 * bert test Signed-off-by: raver119 * Graph FlowPath fix Signed-off-by: raver119 * - GraphProfiler tweaks - NodeProfile now includes shapes Signed-off-by: raver119 * RELU_layer inplace tweak Signed-off-by: raver119 * meh Signed-off-by: raver119 * identity tweaks Signed-off-by: raver119 * bert result validation Signed-off-by: raver119 * - bunch of Shape ops have inplace exec forbidden now - Legacy ops have inplace exec disabled by default now Signed-off-by: raver119 * ffast-math enabled Signed-off-by: raver119 * ffast-math enabled Signed-off-by: raver119 * allow some legacy ops to be inplace Signed-off-by: raver119 * disable -fast_math Signed-off-by: raver119 * disable expensive test for cuda Signed-off-by: raver119 --- libnd4j/blas/cpu/GraphExecutioner.cpp | 6 +- libnd4j/include/graph/VariableProxy.h | 1 + libnd4j/include/graph/VariableSpace.h | 1 + libnd4j/include/graph/impl/Variable.cpp | 5 +- libnd4j/include/graph/impl/VariableProxy.cpp | 3 + libnd4j/include/graph/impl/VariableSpace.cpp | 25 ++++- libnd4j/include/graph/profiling/NodeProfile.h | 17 ++- .../graph/profiling/impl/GraphProfile.cpp | 23 +++- .../profiling/impl/GraphProfilingHelper.cpp | 2 +- .../graph/profiling/impl/NodeProfile.cpp | 41 ++++++- libnd4j/include/ops/declarable/OpDescriptor.h | 3 + .../generic/activations/identity.cpp | 12 +- .../ops/declarable/generic/nn/relu_layer.cpp | 13 +-- .../ops/declarable/generic/shape/reshape.cpp | 2 +- .../declarable/generic/shape/reshape_as.cpp | 2 +- .../generic/shape/tile_to_shape.cpp | 2 +- .../declarable/generic/shape/transpose.cpp | 2 +- .../include/ops/declarable/headers/shape.h | 16 +-- .../ops/declarable/impl/DeclarableOp.cpp | 26 ++++- .../include/ops/declarable/impl/LegacyOp.cpp | 4 +- .../impl/LegacyPairwiseTransformOp.cpp | 4 +- .../ops/declarable/impl/LegacyScalarOp.cpp | 10 +- .../declarable/impl/LegacyTransformSameOp.cpp | 4 +- .../impl/LegacyTransformStrictOp.cpp | 4 +- .../ops/declarable/impl/OpDescriptor.cpp | 3 + libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 2 +- .../layers_tests/DeclarableOpsTests10.cpp | 4 + .../layers_tests/DeclarableOpsTests14.cpp | 13 +++ .../layers_tests/DeclarableOpsTests15.cpp | 14 +++ .../layers_tests/PlaygroundTests.cpp | 104 ++++++++++++++++++ 30 files changed, 306 insertions(+), 62 deletions(-) diff --git a/libnd4j/blas/cpu/GraphExecutioner.cpp b/libnd4j/blas/cpu/GraphExecutioner.cpp index 2190afbf1..98b3204cd 100644 --- a/libnd4j/blas/cpu/GraphExecutioner.cpp +++ b/libnd4j/blas/cpu/GraphExecutioner.cpp @@ -179,7 +179,7 @@ namespace graph { nd4j_debug("Embedded graph execution finished. %i variable(s) migrated\n", cnt); } else if (node->hasCustomOp()) { - // if we have something to execute - lets just execute it. + // now, if we have something to execute - lets just execute it. auto status = node->getCustomOp()->execute(&context); if (status != ND4J_STATUS_OK) return status; @@ -494,8 +494,10 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) nd4j::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m); } - if (tempFlow) + if (tempFlow) { delete flowPath; + __variableSpace->setFlowPath(nullptr); + } return Status::OK(); } diff --git a/libnd4j/include/graph/VariableProxy.h b/libnd4j/include/graph/VariableProxy.h index 1c253e9d8..c2a6e9c62 100644 --- a/libnd4j/include/graph/VariableProxy.h +++ b/libnd4j/include/graph/VariableProxy.h @@ -58,6 +58,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void replaceVariable(Variable *variable); diff --git a/libnd4j/include/graph/VariableSpace.h b/libnd4j/include/graph/VariableSpace.h index 9443d34b1..81abaf6e8 100644 --- a/libnd4j/include/graph/VariableSpace.h +++ b/libnd4j/include/graph/VariableSpace.h @@ -100,6 +100,7 @@ namespace nd4j { virtual void putVariable(int id, Variable *variable); virtual void putVariable(int id, NDArray *array); virtual void putVariable(int id, int idx, NDArray *array); + virtual void putVariable(int id, int idx, NDArray &array); virtual void putVariable(int id, int idx, Variable *array); virtual void dropVariable(std::pair &pair); diff --git a/libnd4j/include/graph/impl/Variable.cpp b/libnd4j/include/graph/impl/Variable.cpp index 5b8f00b25..c2c5ff61f 100644 --- a/libnd4j/include/graph/impl/Variable.cpp +++ b/libnd4j/include/graph/impl/Variable.cpp @@ -60,8 +60,11 @@ namespace nd4j { result->_name = this->_name; result->_index = this->_index; - if (this->_ndarray != nullptr) + if (this->_ndarray != nullptr) { result->_ndarray = new NDArray(this->_ndarray->dup(this->_ndarray->ordering())); + result->_readOnly = false; + result->_removable = true; + } if (this->_list != nullptr) result->_list = this->_list->clone(); diff --git a/libnd4j/include/graph/impl/VariableProxy.cpp b/libnd4j/include/graph/impl/VariableProxy.cpp index 85664f24a..e8abf1310 100644 --- a/libnd4j/include/graph/impl/VariableProxy.cpp +++ b/libnd4j/include/graph/impl/VariableProxy.cpp @@ -191,6 +191,9 @@ namespace nd4j { _current->putVariable(id, array); } + void nd4j::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) { + _current->putVariable(id, idx, array); + } void VariableProxy::putVariable(int id, int idx, NDArray *array) { _current->putVariable(id, idx, array); diff --git a/libnd4j/include/graph/impl/VariableSpace.cpp b/libnd4j/include/graph/impl/VariableSpace.cpp index 8318befb0..735f0260a 100644 --- a/libnd4j/include/graph/impl/VariableSpace.cpp +++ b/libnd4j/include/graph/impl/VariableSpace.cpp @@ -263,19 +263,19 @@ namespace nd4j { void nd4j::graph::VariableSpace::putVariable(int id, Variable *variable) { // we don't want to add variables more then once if (_variables.count(id) > 0 || _temporary.count(id) > 0) { - // nd4j_verbose("Trying to update variable for node_%i\n", id); - auto local = id < 0 ? _variables.at(id) : _temporary.at(id); if (!local->hasNDArray() && variable->hasNDArray()) { - // nd4j_verbose("Saving variable for node_%i\n", id); local->setNDArray(variable->getNDArray()); + + // we're inheriting this from Variable + local->markReadOnly(variable->isReadOnly()); + local->markRemovable(variable->isRemovable()); } + return; } - //nd4j_debug("Adding Variable to Space: id: %i; Array is null: %i;\n", id, variable->getNDArray() == nullptr); - _varmap.lock(); _handles->emplace_back(variable); @@ -314,6 +314,21 @@ namespace nd4j { } } + void nd4j::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) { + auto *var = new nd4j::graph::Variable(&array, "", id, idx); + var->markRemovable(false); + var->markReadOnly(true); + + // let's see if this op needs + bool d = this->hasVariable(id, idx); + + this->putVariable(id, var); + + // if var for this nodeid already exists - we'll just delete variable + if (d) + delete var; + } + void nd4j::graph::VariableSpace::putVariable(int id, NDArray *array) { auto *var = new nd4j::graph::Variable(array); this->putVariable(id, var); diff --git a/libnd4j/include/graph/profiling/NodeProfile.h b/libnd4j/include/graph/profiling/NodeProfile.h index 51b02326d..62df0c34a 100644 --- a/libnd4j/include/graph/profiling/NodeProfile.h +++ b/libnd4j/include/graph/profiling/NodeProfile.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace graph { @@ -65,6 +66,9 @@ namespace nd4j { // total amount of memory used during execution Nd4jLong _memoryTotal = 0L; + + std::vector _inputShapes; + std::vector _outputShapes; public: NodeProfile() = default; ~NodeProfile() = default; @@ -84,10 +88,15 @@ namespace nd4j { void setObjectsSize(Nd4jLong bytes); void setTotalSize(Nd4jLong bytes); - Nd4jLong getActivationsSize(); - Nd4jLong getTemporarySize(); - Nd4jLong getObjectsSize(); - Nd4jLong getTotalSize(); + void addInputShape(Nd4jLong *shapeInfo); + void addOutputShape(Nd4jLong *shapeInfo); + + Nd4jLong getActivationsSize() const; + Nd4jLong getTemporarySize() const; + Nd4jLong getObjectsSize() const; + Nd4jLong getTotalSize() const; + + Nd4jLong getExecutionTime() const; std::string& name(); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp index 6c7cccc01..ea8e7bc49 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace nd4j { namespace graph { @@ -184,9 +186,26 @@ namespace nd4j { if (_profiles.empty()) nd4j_printf("No nodes in graph\n",""); - for (auto v: _profiles) + // printint out stuff + std::vector sorted; + for (auto v: _profiles) { v->printOut(); - + sorted.emplace_back(v); + } + + if (_profiles.size() > 1) { + // building hot spots + std::sort(sorted.begin(), sorted.end(), [](const NodeProfile *a, const NodeProfile *b) -> bool { + return a->getExecutionTime() > b->getExecutionTime(); + }); + + nd4j_printf("\nTop 30 reports by EXEC:\n", ""); + auto limit = nd4j::math::nd4j_min(30, sorted.size()); + for (int e = 0; e < limit; e++) { + sorted[e]->printOut(); + } + } + nd4j_printf("\nSpecial timers:\n", ""); if (_timings.empty()) nd4j_printf("No special timers were set\n",""); diff --git a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp index 025cd8651..cbea09616 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp @@ -32,7 +32,7 @@ namespace nd4j { // graph->printOut(); // warm up - for (int e = 0; e < 1000; e++) { + for (int e = 0; e < iterations; e++) { FlowPath fp; auto _vs = varSpace->clone(); diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index ab5d2a4c4..c8b00e788 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace graph { @@ -35,9 +36,23 @@ namespace nd4j { nd4j_printf(" Memory: ACT: %lld; TMP: %lld; OBJ: %lld; TTL: %lld;\n", _memoryActivations / _merges, _memoryTemporary / _merges, _memoryObjects / _merges, _memoryTotal / _merges); nd4j_printf(" Time: PREP: %lld ns; EXEC: %lld ns; TTL: %lld ns;\n", _preparationTime / _merges, _executionTime / _merges, _totalTime / _merges); nd4j_printf(" PREP: INPUT: %lld ns; SHAPE: %lld ns; ARRAY: %lld ns;\n", _inputTime / _merges, _shapeTime / _merges, _arrayTime / _merges); + + std::string inputs; + std::string outputs; + + int cnt = 0; + for (const auto &v: _inputShapes) + inputs += v + " "; + + for (const auto &v: _outputShapes) + outputs += v + " "; + + + nd4j_printf(" Inputs: %s\n", inputs.c_str()); + nd4j_printf(" Outputs: %s\n", outputs.c_str()); }; - Nd4jLong NodeProfile::getActivationsSize() { + Nd4jLong NodeProfile::getActivationsSize() const { return _memoryActivations; } @@ -53,15 +68,15 @@ namespace nd4j { _inputTime = time; } - Nd4jLong NodeProfile::getTemporarySize() { + Nd4jLong NodeProfile::getTemporarySize() const{ return _memoryTemporary; } - Nd4jLong NodeProfile::getObjectsSize() { + Nd4jLong NodeProfile::getObjectsSize() const{ return _memoryObjects; } - Nd4jLong NodeProfile::getTotalSize() { + Nd4jLong NodeProfile::getTotalSize() const{ return _memoryTotal; } @@ -97,6 +112,18 @@ namespace nd4j { _memoryTotal = bytes; } + Nd4jLong NodeProfile::getExecutionTime() const { + return _executionTime; + } + + void NodeProfile::addInputShape(Nd4jLong *shapeInfo) { + _inputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + } + + void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) { + _outputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + } + void NodeProfile::merge(NodeProfile *other) { _merges += other->_merges; _memoryObjects += other->_memoryObjects; @@ -110,6 +137,9 @@ namespace nd4j { _shapeTime += other->_shapeTime; _arrayTime += other->_arrayTime; _inputTime += other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } std::string& NodeProfile::name() { @@ -129,6 +159,9 @@ namespace nd4j { _shapeTime = other->_shapeTime; _arrayTime = other->_arrayTime; _inputTime = other->_inputTime; + + _inputShapes = other->_inputShapes; + _outputShapes = other->_outputShapes; } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/OpDescriptor.h b/libnd4j/include/ops/declarable/OpDescriptor.h index 2c857f3c0..302559ad8 100644 --- a/libnd4j/include/ops/declarable/OpDescriptor.h +++ b/libnd4j/include/ops/declarable/OpDescriptor.h @@ -147,6 +147,9 @@ namespace nd4j { // returns TRUE if this op allows in-place execution bool allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + void allowInplace(bool reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) int getOpNum(); diff --git a/libnd4j/include/ops/declarable/generic/activations/identity.cpp b/libnd4j/include/ops/declarable/generic/activations/identity.cpp index 5ae5b0690..e424772fc 100644 --- a/libnd4j/include/ops/declarable/generic/activations/identity.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/identity.cpp @@ -27,12 +27,10 @@ namespace nd4j { namespace ops { OP_IMPL(identity, 1, 1, true) { auto first = INPUT_VARIABLE(0); - auto z = this->getZ(block); + auto z = OUTPUT_VARIABLE(0); - // just for lulz - first->applyTransform(nd4j::transform::Identity, *z); - - STORE_RESULT(*z); + if (!block.isInplace()) + first->applyTransform(nd4j::transform::Identity, *z); return Status::OK(); } @@ -60,8 +58,8 @@ namespace nd4j { DECLARE_TYPES(identity_bp) { getOpDescriptor() ->setAllowedInputTypes(0, DataType::ANY) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp index cfc080117..22c7a9137 100644 --- a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp @@ -31,22 +31,17 @@ namespace nd4j { REQUIRE_TRUE(w->isMatrix(), 0, "relu_layer: weights argument should be a 2D tensor, but got rank %i instead!", w->rankOf()); REQUIRE_TRUE(b->isVector(), 0, "relu_layer: biases argument should be a 1D tensor, but got rank %i instead!", b->rankOf()); REQUIRE_TRUE(b->lengthOf() == w->sizeAt(1), 0, "relu_layer: biases array length should match to columns of weights matrix, however got length = %i and columns = %i!", b->lengthOf(), w->sizeAt(1)); - REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", - x->sizeAt(1), w->sizeAt(0)); - + REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", x->sizeAt(1), w->sizeAt(0)); auto output = OUTPUT_VARIABLE(0); - //T bound = (T)0.f; - //nd4j_printf("Matrix x(%ix%i), Matrix w(%ix%i), b(1x%i)\n", x->sizeAt(0), x->sizeAt(1), w->sizeAt(0), w->sizeAt(1), b->lengthOf()); nd4j::ops::xw_plus_b op; - std::unique_ptr result(op.evaluate({x, w, b})); - REQUIRE_TRUE(Status::OK() == result->status(), 0, "relu_layer: xw_plus_b op failed on input data."); + auto status = op.execute({x, w, b}, {output}); + REQUIRE_TRUE(Status::OK() == status, 0, "relu_layer: xw_plus_b op failed on input data."); auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0; - auto xw = result->at(0); - xw->applyScalar(nd4j::scalar::RELU, scalar, *output); + output->applyScalar(nd4j::scalar::RELU, scalar, *output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp index 1d76138f2..b8d582481 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp @@ -28,7 +28,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// // here iArgs is a vector with (optional) negative of order as first element: // ({-order, dim1, dim2, dim3, ...}) - CUSTOM_OP_IMPL(reshape, 1, 1, true, 0, -2) { + CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { auto x = INPUT_VARIABLE(0); if (block.width() == 1) { diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp index 92dc2a146..75aafc06f 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp @@ -28,7 +28,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(reshapeas, 2, 1, true, 0, 0) { + CUSTOM_OP_IMPL(reshapeas, 2, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); auto y = INPUT_VARIABLE(1); diff --git a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp index cc88fb46c..d71fbddd5 100644 --- a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(tile_to_shape, 1, 1, true, 0, -1) { + CUSTOM_OP_IMPL(tile_to_shape, 1, 1, false, 0, -1) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp index 5d01b8bbf..15ed67744 100644 --- a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp @@ -28,7 +28,7 @@ namespace nd4j { namespace ops { ////////////////////////////////////////////////////////////////////////// - CUSTOM_OP_IMPL(transpose, 1, 1, true, 0, 0) { + CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); if (block.width() == 1) { if (block.isInplace()) { diff --git a/libnd4j/include/ops/declarable/headers/shape.h b/libnd4j/include/ops/declarable/headers/shape.h index 3d47c24bf..c21cdb84d 100644 --- a/libnd4j/include/ops/declarable/headers/shape.h +++ b/libnd4j/include/ops/declarable/headers/shape.h @@ -26,15 +26,15 @@ namespace nd4j { namespace ops { #if NOT_EXCLUDED(OP_permute) - DECLARE_CUSTOM_OP(permute, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(permute, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_reshapeas) - DECLARE_CUSTOM_OP(reshapeas, 2, 1, true, 0, 0); + DECLARE_CUSTOM_OP(reshapeas, 2, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_transpose) - DECLARE_CUSTOM_OP(transpose, 1, 1, true, 0, 0); + DECLARE_CUSTOM_OP(transpose, 1, 1, false, 0, 0); #endif #if NOT_EXCLUDED(OP_shape_of) @@ -46,7 +46,7 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_squeeze) - DECLARE_CUSTOM_OP(squeeze, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(squeeze, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_expand_dims) @@ -54,11 +54,11 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_reshape) - DECLARE_CUSTOM_OP(reshape, 1, 1, true, 0, -2); + DECLARE_CUSTOM_OP(reshape, 1, 1, false, 0, -2); #endif #if NOT_EXCLUDED(OP_size_at) - DECLARE_CUSTOM_OP(size_at, 1, 1, true, 0, 1); + DECLARE_CUSTOM_OP(size_at, 1, 1, false, 0, 1); #endif /** @@ -80,8 +80,8 @@ namespace nd4j { * @tparam T */ #if NOT_EXCLUDED(OP_tile_to_shape) - DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, true, 0, -1); - DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, true, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, false, 0, -1); + DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, false, 0, -1); #endif /** diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index 46d10b51c..9724b6ba5 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -150,6 +150,22 @@ namespace nd4j { } if (ctx.isInplace()) { + if (Environment::getInstance()->isProfiling() && node != nullptr) { + if (ctx.isFastPath()) { + // + } else { + for (auto p: *ctx.inputs()) { + auto var = ctx.variable(p); + if (var->variableType() == VariableType::NDARRAY) { + NDArray *array = var->getNDArray(); + + node->addInputShape(array->shapeInfo()); + node->addOutputShape(array->shapeInfo()); + } + } + } + } + // do nothing, getZ result will do the trick return static_cast(ctx.width()); } else { @@ -192,6 +208,10 @@ namespace nd4j { auto inputTime = std::chrono::duration_cast(inputEnd - inputStart).count(); node->setInputTime(inputTime); + // saving output shapes in profile + for (int e = 0; e < inSha.size(); e++) + node->addInputShape(inSha.at(e)); + shapeStart = std::chrono::system_clock::now(); } @@ -204,6 +224,10 @@ namespace nd4j { auto prepTime = std::chrono::duration_cast(shapeEnd - shapeStart).count(); node->setShapeFunctionTime(prepTime); + // saving output shapes in profile + for (int e = 0; e < outSha->size(); e++) + node->addOutputShape(outSha->at(e)); + arrayStart = std::chrono::system_clock::now(); } @@ -562,7 +586,7 @@ namespace nd4j { block->setInnerTime(outerTime); } - if (Environment::getInstance()->isProfiling()) { + if (Environment::getInstance()->isProfiling() && !block->isFastPath()) { auto fp = block->getVariableSpace()->flowPath(); if (fp != nullptr) { auto p = fp->profile(); diff --git a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp index f93df63f1..e9920c409 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp @@ -23,11 +23,11 @@ namespace nd4j { namespace ops { - LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _numInputs = numInputs; } - LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) { + LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _opNum = opNum; _numInputs = numInputs; } diff --git a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp index 07c7234f5..49f896be1 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp @@ -25,11 +25,11 @@ namespace nd4j { namespace ops { LegacyPairwiseTransformOp::LegacyPairwiseTransformOp() : LegacyOp::LegacyOp(2) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyPairwiseTransformOp::LegacyPairwiseTransformOp(int opNum) : LegacyOp::LegacyOp(2, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyPairwiseTransformOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp index b1261b37c..856bfdeaf 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) { - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyScalarOp::LegacyScalarOp(int opNum) : LegacyOp::LegacyOp(1, opNum){ - // no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyScalarOp::clone() { @@ -66,6 +66,7 @@ namespace nd4j { NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y->buffer(), y->shapeInfo(), y->specialBuffer(), y->specialShapeInfo(), extras.argumentsAsT(z->dataType())); + NDArray::registerSpecialUse({z}, {x, y}); } else if (block.getTArguments()->size() > 0) { auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); @@ -78,10 +79,9 @@ namespace nd4j { NDArray::prepareSpecialUse({z}, {x, _scalar}); NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), _scalar->buffer(), _scalar->shapeInfo(), _scalar->specialBuffer(), _scalar->specialShapeInfo(), extras.argumentsAsT(z->dataType())); - } - manager.synchronize(); - STORE_RESULT(*z); + NDArray::registerSpecialUse({z}, {x, _scalar}); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp index 49fef3af0..6b097c3af 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformSameOp::LegacyTransformSameOp() : LegacyOp::LegacyOp(1) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformSameOp::LegacyTransformSameOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformSameOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp index 19a51191a..a390a458c 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp @@ -26,11 +26,11 @@ namespace nd4j { namespace ops { LegacyTransformStrictOp::LegacyTransformStrictOp() : LegacyOp::LegacyOp(1) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyTransformStrictOp::LegacyTransformStrictOp(int opNum) : LegacyOp::LegacyOp(1, opNum) { - // just a no-op + this->getOpDescriptor()->allowInplace(true); } LegacyOp* LegacyTransformStrictOp::clone() { diff --git a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp index 5139a95cc..417fc0605 100644 --- a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp +++ b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp @@ -50,6 +50,9 @@ namespace nd4j { _scalar = isScalar; } + void OpDescriptor::allowInplace(bool reallyAllow){ + _allowsInplace = reallyAllow; + } bool OpDescriptor::operator==(const OpDescriptor& other) const { if (_hash == -1 && other._hash == -1) diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index f538eb9cd..17ae714cd 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -52,7 +52,7 @@ elseif(WIN32) set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") endif() else() - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index a0722f9d0..484719a45 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -3087,6 +3087,10 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray x = NDArrayFactory::create('c', {2,4,5,3}); NDArray exp = NDArrayFactory::create('c', {2,4,5,3},{ diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 600004ec2..7592bee27 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -78,6 +78,11 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_1) { } TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {5}, {1, 2, 3, std::numeric_limits::infinity(), 5}); auto y = NDArrayFactory::create('c', {5}, {1, 2, 3, -std::numeric_limits::infinity(), 5}); @@ -332,6 +337,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_max_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_sum sumOp; @@ -343,6 +352,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { } TEST_F(DeclarableOpsTests14, test_empty_reduce_mean_1) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif auto e = NDArrayFactory::create('c', {1, 0}); nd4j::ops::reduce_mean sumOp; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index ee8691bbb..199630d4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -584,6 +584,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_1) { } TEST_F(DeclarableOpsTests15, test_check_numeric_2) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::infinity()}); auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -598,6 +603,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_2) { } TEST_F(DeclarableOpsTests15, test_check_numeric_3) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif + auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, std::numeric_limits::quiet_NaN()}); auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); @@ -1530,6 +1540,10 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test10) { } TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { +#ifdef FFAST_MATH + if (1 > 0) + return; +#endif NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32); NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32); diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 9f75beca1..7db7a791a 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -65,6 +65,110 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } + +TEST_F(PlaygroundTests, test_bert_1) { + // this test will run ONLY if this model exists + if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) + return; + + auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb"); + + auto t = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext.numpy"); + auto u = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_1.numpy"); + auto v = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_4.numpy"); + auto z = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model_output.numpy"); + + //graph->printOut(); + + graph->tagInplaceNodes(); + + graph->getVariableSpace()->putVariable(85,0, t); + graph->getVariableSpace()->putVariable(86,0, u); + graph->getVariableSpace()->putVariable(87,0, v); + +/* + // validating graph now + auto status = GraphExecutioner::execute(graph); + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198)); + + auto array = graph->getVariableSpace()->getVariable(198)->getNDArray(); + ASSERT_EQ(z, *array); +*/ + + nd4j::Environment::getInstance()->setProfiling(true); + auto profile = GraphProfilingHelper::profile(graph, 1); + + profile->printOut(); + + nd4j::Environment::getInstance()->setProfiling(false); + delete profile; + +/* + std::vector values; + + for (int e = 0; e < 1; e++) { + auto timeStart = std::chrono::system_clock::now(); + + GraphExecutioner::execute(graph); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); +*/ + + delete graph; +} + +/* +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 10; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = NDArrayFactory::create_('c', {64, 128, 1}); + aY[e] = NDArrayFactory::create_('c', {768}); + aZ[e] = NDArrayFactory::create_('c', {64, 128, 768}); + + aX[e]->assign(119 * (e+1)); + aY[e]->assign(119 * (e+3)); + } + + std::vector values; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} +*/ + /* TEST_F(PlaygroundTests, test_s_0) {