R119 tests (#238)
* one small test Signed-off-by: raver119 <raver119@gmail.com> * one small test Signed-off-by: raver119 <raver119@gmail.com> * bert test Signed-off-by: raver119 <raver119@gmail.com> * Graph FlowPath fix Signed-off-by: raver119 <raver119@gmail.com> * - GraphProfiler tweaks - NodeProfile now includes shapes Signed-off-by: raver119 <raver119@gmail.com> * RELU_layer inplace tweak Signed-off-by: raver119 <raver119@gmail.com> * meh Signed-off-by: raver119 <raver119@gmail.com> * identity tweaks Signed-off-by: raver119 <raver119@gmail.com> * bert result validation Signed-off-by: raver119 <raver119@gmail.com> * - bunch of Shape ops have inplace exec forbidden now - Legacy ops have inplace exec disabled by default now Signed-off-by: raver119 <raver119@gmail.com> * ffast-math enabled Signed-off-by: raver119 <raver119@gmail.com> * ffast-math enabled Signed-off-by: raver119 <raver119@gmail.com> * allow some legacy ops to be inplace Signed-off-by: raver119 <raver119@gmail.com> * disable -fast_math Signed-off-by: raver119 <raver119@gmail.com> * disable expensive test for cuda Signed-off-by: raver119 <raver119@gmail.com>master
parent
fe47f52896
commit
3de3cd8277
|
@ -179,7 +179,7 @@ namespace graph {
|
|||
nd4j_debug("Embedded graph execution finished. %i variable(s) migrated\n", cnt);
|
||||
|
||||
} else if (node->hasCustomOp()) {
|
||||
// if we have something to execute - lets just execute it.
|
||||
// now, if we have something to execute - lets just execute it.
|
||||
auto status = node->getCustomOp()->execute(&context);
|
||||
if (status != ND4J_STATUS_OK)
|
||||
return status;
|
||||
|
@ -494,8 +494,10 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace)
|
|||
nd4j::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m);
|
||||
}
|
||||
|
||||
if (tempFlow)
|
||||
if (tempFlow) {
|
||||
delete flowPath;
|
||||
__variableSpace->setFlowPath(nullptr);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -58,6 +58,7 @@ namespace nd4j {
|
|||
virtual void putVariable(int id, Variable *variable);
|
||||
virtual void putVariable(int id, NDArray *array);
|
||||
virtual void putVariable(int id, int idx, NDArray *array);
|
||||
virtual void putVariable(int id, int idx, NDArray &array);
|
||||
virtual void putVariable(int id, int idx, Variable *array);
|
||||
|
||||
virtual void replaceVariable(Variable *variable);
|
||||
|
|
|
@ -100,6 +100,7 @@ namespace nd4j {
|
|||
virtual void putVariable(int id, Variable *variable);
|
||||
virtual void putVariable(int id, NDArray *array);
|
||||
virtual void putVariable(int id, int idx, NDArray *array);
|
||||
virtual void putVariable(int id, int idx, NDArray &array);
|
||||
virtual void putVariable(int id, int idx, Variable *array);
|
||||
|
||||
virtual void dropVariable(std::pair<int,int> &pair);
|
||||
|
|
|
@ -60,8 +60,11 @@ namespace nd4j {
|
|||
result->_name = this->_name;
|
||||
result->_index = this->_index;
|
||||
|
||||
if (this->_ndarray != nullptr)
|
||||
if (this->_ndarray != nullptr) {
|
||||
result->_ndarray = new NDArray(this->_ndarray->dup(this->_ndarray->ordering()));
|
||||
result->_readOnly = false;
|
||||
result->_removable = true;
|
||||
}
|
||||
|
||||
if (this->_list != nullptr)
|
||||
result->_list = this->_list->clone();
|
||||
|
|
|
@ -191,6 +191,9 @@ namespace nd4j {
|
|||
_current->putVariable(id, array);
|
||||
}
|
||||
|
||||
void nd4j::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) {
|
||||
_current->putVariable(id, idx, array);
|
||||
}
|
||||
|
||||
void VariableProxy::putVariable(int id, int idx, NDArray *array) {
|
||||
_current->putVariable(id, idx, array);
|
||||
|
|
|
@ -263,19 +263,19 @@ namespace nd4j {
|
|||
void nd4j::graph::VariableSpace::putVariable(int id, Variable *variable) {
|
||||
// we don't want to add variables more then once
|
||||
if (_variables.count(id) > 0 || _temporary.count(id) > 0) {
|
||||
// nd4j_verbose("Trying to update variable for node_%i\n", id);
|
||||
|
||||
auto local = id < 0 ? _variables.at(id) : _temporary.at(id);
|
||||
|
||||
if (!local->hasNDArray() && variable->hasNDArray()) {
|
||||
// nd4j_verbose("Saving variable for node_%i\n", id);
|
||||
local->setNDArray(variable->getNDArray());
|
||||
|
||||
// we're inheriting this from Variable
|
||||
local->markReadOnly(variable->isReadOnly());
|
||||
local->markRemovable(variable->isRemovable());
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
//nd4j_debug("Adding Variable to Space: id: %i; Array is null: %i;\n", id, variable->getNDArray() == nullptr);
|
||||
|
||||
_varmap.lock();
|
||||
|
||||
_handles->emplace_back(variable);
|
||||
|
@ -314,6 +314,21 @@ namespace nd4j {
|
|||
}
|
||||
}
|
||||
|
||||
void nd4j::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) {
|
||||
auto *var = new nd4j::graph::Variable(&array, "", id, idx);
|
||||
var->markRemovable(false);
|
||||
var->markReadOnly(true);
|
||||
|
||||
// let's see if this op needs
|
||||
bool d = this->hasVariable(id, idx);
|
||||
|
||||
this->putVariable(id, var);
|
||||
|
||||
// if var for this nodeid already exists - we'll just delete variable
|
||||
if (d)
|
||||
delete var;
|
||||
}
|
||||
|
||||
void nd4j::graph::VariableSpace::putVariable(int id, NDArray *array) {
|
||||
auto *var = new nd4j::graph::Variable(array);
|
||||
this->putVariable(id, var);
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include <pointercast.h>
|
||||
#include <dll.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace nd4j {
|
||||
namespace graph {
|
||||
|
@ -65,6 +66,9 @@ namespace nd4j {
|
|||
|
||||
// total amount of memory used during execution
|
||||
Nd4jLong _memoryTotal = 0L;
|
||||
|
||||
std::vector<std::string> _inputShapes;
|
||||
std::vector<std::string> _outputShapes;
|
||||
public:
|
||||
NodeProfile() = default;
|
||||
~NodeProfile() = default;
|
||||
|
@ -84,10 +88,15 @@ namespace nd4j {
|
|||
void setObjectsSize(Nd4jLong bytes);
|
||||
void setTotalSize(Nd4jLong bytes);
|
||||
|
||||
Nd4jLong getActivationsSize();
|
||||
Nd4jLong getTemporarySize();
|
||||
Nd4jLong getObjectsSize();
|
||||
Nd4jLong getTotalSize();
|
||||
void addInputShape(Nd4jLong *shapeInfo);
|
||||
void addOutputShape(Nd4jLong *shapeInfo);
|
||||
|
||||
Nd4jLong getActivationsSize() const;
|
||||
Nd4jLong getTemporarySize() const;
|
||||
Nd4jLong getObjectsSize() const;
|
||||
Nd4jLong getTotalSize() const;
|
||||
|
||||
Nd4jLong getExecutionTime() const;
|
||||
|
||||
std::string& name();
|
||||
|
||||
|
|
|
@ -21,6 +21,8 @@
|
|||
#include <graph/profiling/GraphProfile.h>
|
||||
#include <helpers/logger.h>
|
||||
#include <chrono>
|
||||
#include <templatemath.h>
|
||||
#include <algorithm>
|
||||
|
||||
namespace nd4j {
|
||||
namespace graph {
|
||||
|
@ -184,9 +186,26 @@ namespace nd4j {
|
|||
if (_profiles.empty())
|
||||
nd4j_printf("No nodes in graph\n","");
|
||||
|
||||
for (auto v: _profiles)
|
||||
// printint out stuff
|
||||
std::vector<NodeProfile*> sorted;
|
||||
for (auto v: _profiles) {
|
||||
v->printOut();
|
||||
|
||||
sorted.emplace_back(v);
|
||||
}
|
||||
|
||||
if (_profiles.size() > 1) {
|
||||
// building hot spots
|
||||
std::sort(sorted.begin(), sorted.end(), [](const NodeProfile *a, const NodeProfile *b) -> bool {
|
||||
return a->getExecutionTime() > b->getExecutionTime();
|
||||
});
|
||||
|
||||
nd4j_printf("\nTop 30 reports by EXEC:\n", "");
|
||||
auto limit = nd4j::math::nd4j_min<int>(30, sorted.size());
|
||||
for (int e = 0; e < limit; e++) {
|
||||
sorted[e]->printOut();
|
||||
}
|
||||
}
|
||||
|
||||
nd4j_printf("\nSpecial timers:\n", "");
|
||||
if (_timings.empty())
|
||||
nd4j_printf("No special timers were set\n","");
|
||||
|
|
|
@ -32,7 +32,7 @@ namespace nd4j {
|
|||
// graph->printOut();
|
||||
|
||||
// warm up
|
||||
for (int e = 0; e < 1000; e++) {
|
||||
for (int e = 0; e < iterations; e++) {
|
||||
FlowPath fp;
|
||||
|
||||
auto _vs = varSpace->clone();
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include <helpers/logger.h>
|
||||
#include <graph/profiling/NodeProfile.h>
|
||||
#include <helpers/ShapeUtils.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace graph {
|
||||
|
@ -35,9 +36,23 @@ namespace nd4j {
|
|||
nd4j_printf(" Memory: ACT: %lld; TMP: %lld; OBJ: %lld; TTL: %lld;\n", _memoryActivations / _merges, _memoryTemporary / _merges, _memoryObjects / _merges, _memoryTotal / _merges);
|
||||
nd4j_printf(" Time: PREP: %lld ns; EXEC: %lld ns; TTL: %lld ns;\n", _preparationTime / _merges, _executionTime / _merges, _totalTime / _merges);
|
||||
nd4j_printf(" PREP: INPUT: %lld ns; SHAPE: %lld ns; ARRAY: %lld ns;\n", _inputTime / _merges, _shapeTime / _merges, _arrayTime / _merges);
|
||||
|
||||
std::string inputs;
|
||||
std::string outputs;
|
||||
|
||||
int cnt = 0;
|
||||
for (const auto &v: _inputShapes)
|
||||
inputs += v + " ";
|
||||
|
||||
for (const auto &v: _outputShapes)
|
||||
outputs += v + " ";
|
||||
|
||||
|
||||
nd4j_printf(" Inputs: %s\n", inputs.c_str());
|
||||
nd4j_printf(" Outputs: %s\n", outputs.c_str());
|
||||
};
|
||||
|
||||
Nd4jLong NodeProfile::getActivationsSize() {
|
||||
Nd4jLong NodeProfile::getActivationsSize() const {
|
||||
return _memoryActivations;
|
||||
}
|
||||
|
||||
|
@ -53,15 +68,15 @@ namespace nd4j {
|
|||
_inputTime = time;
|
||||
}
|
||||
|
||||
Nd4jLong NodeProfile::getTemporarySize() {
|
||||
Nd4jLong NodeProfile::getTemporarySize() const{
|
||||
return _memoryTemporary;
|
||||
}
|
||||
|
||||
Nd4jLong NodeProfile::getObjectsSize() {
|
||||
Nd4jLong NodeProfile::getObjectsSize() const{
|
||||
return _memoryObjects;
|
||||
}
|
||||
|
||||
Nd4jLong NodeProfile::getTotalSize() {
|
||||
Nd4jLong NodeProfile::getTotalSize() const{
|
||||
return _memoryTotal;
|
||||
}
|
||||
|
||||
|
@ -97,6 +112,18 @@ namespace nd4j {
|
|||
_memoryTotal = bytes;
|
||||
}
|
||||
|
||||
Nd4jLong NodeProfile::getExecutionTime() const {
|
||||
return _executionTime;
|
||||
}
|
||||
|
||||
void NodeProfile::addInputShape(Nd4jLong *shapeInfo) {
|
||||
_inputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo));
|
||||
}
|
||||
|
||||
void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) {
|
||||
_outputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo));
|
||||
}
|
||||
|
||||
void NodeProfile::merge(NodeProfile *other) {
|
||||
_merges += other->_merges;
|
||||
_memoryObjects += other->_memoryObjects;
|
||||
|
@ -110,6 +137,9 @@ namespace nd4j {
|
|||
_shapeTime += other->_shapeTime;
|
||||
_arrayTime += other->_arrayTime;
|
||||
_inputTime += other->_inputTime;
|
||||
|
||||
_inputShapes = other->_inputShapes;
|
||||
_outputShapes = other->_outputShapes;
|
||||
}
|
||||
|
||||
std::string& NodeProfile::name() {
|
||||
|
@ -129,6 +159,9 @@ namespace nd4j {
|
|||
_shapeTime = other->_shapeTime;
|
||||
_arrayTime = other->_arrayTime;
|
||||
_inputTime = other->_inputTime;
|
||||
|
||||
_inputShapes = other->_inputShapes;
|
||||
_outputShapes = other->_outputShapes;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -147,6 +147,9 @@ namespace nd4j {
|
|||
// returns TRUE if this op allows in-place execution
|
||||
bool allowsInplace();
|
||||
|
||||
// this method allows you to enable/disable inplace call for a given op
|
||||
void allowInplace(bool reallyAllow);
|
||||
|
||||
// this method returns opNum (applicable for legacy XYZ ops only)
|
||||
int getOpNum();
|
||||
|
||||
|
|
|
@ -27,12 +27,10 @@ namespace nd4j {
|
|||
namespace ops {
|
||||
OP_IMPL(identity, 1, 1, true) {
|
||||
auto first = INPUT_VARIABLE(0);
|
||||
auto z = this->getZ(block);
|
||||
auto z = OUTPUT_VARIABLE(0);
|
||||
|
||||
// just for lulz
|
||||
first->applyTransform(nd4j::transform::Identity, *z);
|
||||
|
||||
STORE_RESULT(*z);
|
||||
if (!block.isInplace())
|
||||
first->applyTransform(nd4j::transform::Identity, *z);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
@ -60,8 +58,8 @@ namespace nd4j {
|
|||
DECLARE_TYPES(identity_bp) {
|
||||
getOpDescriptor()
|
||||
->setAllowedInputTypes(0, DataType::ANY)
|
||||
->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF})
|
||||
->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF});
|
||||
->setAllowedInputTypes(1, {ALL_FLOATS})
|
||||
->setAllowedOutputTypes(0, {ALL_FLOATS});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,22 +31,17 @@ namespace nd4j {
|
|||
REQUIRE_TRUE(w->isMatrix(), 0, "relu_layer: weights argument should be a 2D tensor, but got rank %i instead!", w->rankOf());
|
||||
REQUIRE_TRUE(b->isVector(), 0, "relu_layer: biases argument should be a 1D tensor, but got rank %i instead!", b->rankOf());
|
||||
REQUIRE_TRUE(b->lengthOf() == w->sizeAt(1), 0, "relu_layer: biases array length should match to columns of weights matrix, however got length = %i and columns = %i!", b->lengthOf(), w->sizeAt(1));
|
||||
REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!",
|
||||
x->sizeAt(1), w->sizeAt(0));
|
||||
|
||||
REQUIRE_TRUE(x->sizeAt(1) == w->sizeAt(0), 0, "relu_layer: number of x columns should match to row number of weights matrix, but got x_columns = %i and weights_rows = %i!", x->sizeAt(1), w->sizeAt(0));
|
||||
|
||||
auto output = OUTPUT_VARIABLE(0);
|
||||
//T bound = (T)0.f;
|
||||
//nd4j_printf("Matrix x(%ix%i), Matrix w(%ix%i), b(1x%i)\n", x->sizeAt(0), x->sizeAt(1), w->sizeAt(0), w->sizeAt(1), b->lengthOf());
|
||||
|
||||
nd4j::ops::xw_plus_b op;
|
||||
std::unique_ptr<ResultSet> result(op.evaluate({x, w, b}));
|
||||
REQUIRE_TRUE(Status::OK() == result->status(), 0, "relu_layer: xw_plus_b op failed on input data.");
|
||||
auto status = op.execute({x, w, b}, {output});
|
||||
REQUIRE_TRUE(Status::OK() == status, 0, "relu_layer: xw_plus_b op failed on input data.");
|
||||
|
||||
auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0;
|
||||
|
||||
auto xw = result->at(0);
|
||||
xw->applyScalar(nd4j::scalar::RELU, scalar, *output);
|
||||
output->applyScalar(nd4j::scalar::RELU, scalar, *output);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace nd4j {
|
|||
//////////////////////////////////////////////////////////////////////////
|
||||
// here iArgs is a vector with (optional) negative of order as first element:
|
||||
// ({-order, dim1, dim2, dim3, ...})
|
||||
CUSTOM_OP_IMPL(reshape, 1, 1, true, 0, -2) {
|
||||
CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) {
|
||||
auto x = INPUT_VARIABLE(0);
|
||||
|
||||
if (block.width() == 1) {
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace nd4j {
|
|||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
CUSTOM_OP_IMPL(reshapeas, 2, 1, true, 0, 0) {
|
||||
CUSTOM_OP_IMPL(reshapeas, 2, 1, false, 0, 0) {
|
||||
|
||||
auto x = INPUT_VARIABLE(0);
|
||||
auto y = INPUT_VARIABLE(1);
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
CUSTOM_OP_IMPL(tile_to_shape, 1, 1, true, 0, -1) {
|
||||
CUSTOM_OP_IMPL(tile_to_shape, 1, 1, false, 0, -1) {
|
||||
|
||||
auto input = INPUT_VARIABLE(0);
|
||||
auto output = OUTPUT_VARIABLE(0);
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace nd4j {
|
|||
namespace ops {
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
CUSTOM_OP_IMPL(transpose, 1, 1, true, 0, 0) {
|
||||
CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) {
|
||||
auto x = INPUT_VARIABLE(0);
|
||||
if (block.width() == 1) {
|
||||
if (block.isInplace()) {
|
||||
|
|
|
@ -26,15 +26,15 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
#if NOT_EXCLUDED(OP_permute)
|
||||
DECLARE_CUSTOM_OP(permute, 1, 1, true, 0, -2);
|
||||
DECLARE_CUSTOM_OP(permute, 1, 1, false, 0, -2);
|
||||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_reshapeas)
|
||||
DECLARE_CUSTOM_OP(reshapeas, 2, 1, true, 0, 0);
|
||||
DECLARE_CUSTOM_OP(reshapeas, 2, 1, false, 0, 0);
|
||||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_transpose)
|
||||
DECLARE_CUSTOM_OP(transpose, 1, 1, true, 0, 0);
|
||||
DECLARE_CUSTOM_OP(transpose, 1, 1, false, 0, 0);
|
||||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_shape_of)
|
||||
|
@ -46,7 +46,7 @@ namespace nd4j {
|
|||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_squeeze)
|
||||
DECLARE_CUSTOM_OP(squeeze, 1, 1, true, 0, -2);
|
||||
DECLARE_CUSTOM_OP(squeeze, 1, 1, false, 0, -2);
|
||||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_expand_dims)
|
||||
|
@ -54,11 +54,11 @@ namespace nd4j {
|
|||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_reshape)
|
||||
DECLARE_CUSTOM_OP(reshape, 1, 1, true, 0, -2);
|
||||
DECLARE_CUSTOM_OP(reshape, 1, 1, false, 0, -2);
|
||||
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_size_at)
|
||||
DECLARE_CUSTOM_OP(size_at, 1, 1, true, 0, 1);
|
||||
DECLARE_CUSTOM_OP(size_at, 1, 1, false, 0, 1);
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -80,8 +80,8 @@ namespace nd4j {
|
|||
* @tparam T
|
||||
*/
|
||||
#if NOT_EXCLUDED(OP_tile_to_shape)
|
||||
DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, true, 0, -1);
|
||||
DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, true, 0, -1);
|
||||
DECLARE_CUSTOM_OP(tile_to_shape, 1, 1, false, 0, -1);
|
||||
DECLARE_CUSTOM_OP(tile_to_shape_bp, 2, 1, false, 0, -1);
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
|
|
@ -150,6 +150,22 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
if (ctx.isInplace()) {
|
||||
if (Environment::getInstance()->isProfiling() && node != nullptr) {
|
||||
if (ctx.isFastPath()) {
|
||||
//
|
||||
} else {
|
||||
for (auto p: *ctx.inputs()) {
|
||||
auto var = ctx.variable(p);
|
||||
if (var->variableType() == VariableType::NDARRAY) {
|
||||
NDArray *array = var->getNDArray();
|
||||
|
||||
node->addInputShape(array->shapeInfo());
|
||||
node->addOutputShape(array->shapeInfo());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// do nothing, getZ result will do the trick
|
||||
return static_cast<int>(ctx.width());
|
||||
} else {
|
||||
|
@ -192,6 +208,10 @@ namespace nd4j {
|
|||
auto inputTime = std::chrono::duration_cast<std::chrono::nanoseconds>(inputEnd - inputStart).count();
|
||||
node->setInputTime(inputTime);
|
||||
|
||||
// saving output shapes in profile
|
||||
for (int e = 0; e < inSha.size(); e++)
|
||||
node->addInputShape(inSha.at(e));
|
||||
|
||||
shapeStart = std::chrono::system_clock::now();
|
||||
}
|
||||
|
||||
|
@ -204,6 +224,10 @@ namespace nd4j {
|
|||
auto prepTime = std::chrono::duration_cast<std::chrono::nanoseconds>(shapeEnd - shapeStart).count();
|
||||
node->setShapeFunctionTime(prepTime);
|
||||
|
||||
// saving output shapes in profile
|
||||
for (int e = 0; e < outSha->size(); e++)
|
||||
node->addOutputShape(outSha->at(e));
|
||||
|
||||
arrayStart = std::chrono::system_clock::now();
|
||||
}
|
||||
|
||||
|
@ -562,7 +586,7 @@ namespace nd4j {
|
|||
block->setInnerTime(outerTime);
|
||||
}
|
||||
|
||||
if (Environment::getInstance()->isProfiling()) {
|
||||
if (Environment::getInstance()->isProfiling() && !block->isFastPath()) {
|
||||
auto fp = block->getVariableSpace()->flowPath();
|
||||
if (fp != nullptr) {
|
||||
auto p = fp->profile();
|
||||
|
|
|
@ -23,11 +23,11 @@
|
|||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) {
|
||||
LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) {
|
||||
_numInputs = numInputs;
|
||||
}
|
||||
|
||||
LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", true) {
|
||||
LegacyOp::LegacyOp(int numInputs, int opNum) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) {
|
||||
_opNum = opNum;
|
||||
_numInputs = numInputs;
|
||||
}
|
||||
|
|
|
@ -25,11 +25,11 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
LegacyPairwiseTransformOp::LegacyPairwiseTransformOp() : LegacyOp::LegacyOp(2) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyPairwiseTransformOp::LegacyPairwiseTransformOp(int opNum) : LegacyOp::LegacyOp(2, opNum) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyOp* LegacyPairwiseTransformOp::clone() {
|
||||
|
|
|
@ -26,11 +26,11 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) {
|
||||
// no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyScalarOp::LegacyScalarOp(int opNum) : LegacyOp::LegacyOp(1, opNum){
|
||||
// no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyOp* LegacyScalarOp::clone() {
|
||||
|
@ -66,6 +66,7 @@ namespace nd4j {
|
|||
|
||||
NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y->buffer(), y->shapeInfo(), y->specialBuffer(), y->specialShapeInfo(), extras.argumentsAsT(z->dataType()));
|
||||
|
||||
NDArray::registerSpecialUse({z}, {x, y});
|
||||
} else if (block.getTArguments()->size() > 0) {
|
||||
auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext());
|
||||
|
||||
|
@ -78,10 +79,9 @@ namespace nd4j {
|
|||
NDArray::prepareSpecialUse({z}, {x, _scalar});
|
||||
|
||||
NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), _scalar->buffer(), _scalar->shapeInfo(), _scalar->specialBuffer(), _scalar->specialShapeInfo(), extras.argumentsAsT(z->dataType()));
|
||||
}
|
||||
|
||||
manager.synchronize();
|
||||
STORE_RESULT(*z);
|
||||
NDArray::registerSpecialUse({z}, {x, _scalar});
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -26,11 +26,11 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
LegacyTransformSameOp::LegacyTransformSameOp() : LegacyOp::LegacyOp(1) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyTransformSameOp::LegacyTransformSameOp(int opNum) : LegacyOp::LegacyOp(1, opNum) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyOp* LegacyTransformSameOp::clone() {
|
||||
|
|
|
@ -26,11 +26,11 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
LegacyTransformStrictOp::LegacyTransformStrictOp() : LegacyOp::LegacyOp(1) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyTransformStrictOp::LegacyTransformStrictOp(int opNum) : LegacyOp::LegacyOp(1, opNum) {
|
||||
// just a no-op
|
||||
this->getOpDescriptor()->allowInplace(true);
|
||||
}
|
||||
|
||||
LegacyOp* LegacyTransformStrictOp::clone() {
|
||||
|
|
|
@ -50,6 +50,9 @@ namespace nd4j {
|
|||
_scalar = isScalar;
|
||||
}
|
||||
|
||||
void OpDescriptor::allowInplace(bool reallyAllow){
|
||||
_allowsInplace = reallyAllow;
|
||||
}
|
||||
|
||||
bool OpDescriptor::operator==(const OpDescriptor& other) const {
|
||||
if (_hash == -1 && other._hash == -1)
|
||||
|
|
|
@ -52,7 +52,7 @@ elseif(WIN32)
|
|||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
|
||||
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
|
||||
|
|
|
@ -3087,6 +3087,10 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_3) {
|
|||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
NDArray x = NDArrayFactory::create<float>('c', {2,4,5,3});
|
||||
NDArray exp = NDArrayFactory::create<float>('c', {2,4,5,3},{
|
||||
|
|
|
@ -78,6 +78,11 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_1) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
auto x = NDArrayFactory::create<double>('c', {5}, {1, 2, 3, std::numeric_limits<double>::infinity(), 5});
|
||||
auto y = NDArrayFactory::create<double>('c', {5}, {1, 2, 3, -std::numeric_limits<double>::infinity(), 5});
|
||||
|
||||
|
@ -332,6 +337,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_max_1) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
auto e = NDArrayFactory::create<float>('c', {1, 0});
|
||||
nd4j::ops::reduce_sum sumOp;
|
||||
|
@ -343,6 +352,10 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests14, test_empty_reduce_mean_1) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
auto e = NDArrayFactory::create<float>('c', {1, 0});
|
||||
nd4j::ops::reduce_mean sumOp;
|
||||
|
|
|
@ -584,6 +584,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_1) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests15, test_check_numeric_2) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
auto x = NDArrayFactory::create<float>('c', {3},{1.f, 2.f, std::numeric_limits<float>::infinity()});
|
||||
auto y = NDArrayFactory::string("should trigger");
|
||||
auto z = NDArrayFactory::create<float>('c', {3} );
|
||||
|
@ -598,6 +603,11 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_2) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests15, test_check_numeric_3) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
auto x = NDArrayFactory::create<float>('c', {3},{1.f, 2.f, std::numeric_limits<float>::quiet_NaN()});
|
||||
auto y = NDArrayFactory::string("should trigger");
|
||||
auto z = NDArrayFactory::create<float>('c', {3} );
|
||||
|
@ -1530,6 +1540,10 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test10) {
|
|||
}
|
||||
|
||||
TEST_F(DeclarableOpsTests15, Pow_BP_Test11) {
|
||||
#ifdef FFAST_MATH
|
||||
if (1 > 0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32);
|
||||
NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32);
|
||||
|
|
|
@ -65,6 +65,110 @@ TEST_F(PlaygroundTests, test_avx) {
|
|||
nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel());
|
||||
}
|
||||
|
||||
|
||||
TEST_F(PlaygroundTests, test_bert_1) {
|
||||
// this test will run ONLY if this model exists
|
||||
if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0)
|
||||
return;
|
||||
|
||||
auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb");
|
||||
|
||||
auto t = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext.numpy");
|
||||
auto u = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_1.numpy");
|
||||
auto v = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_4.numpy");
|
||||
auto z = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model_output.numpy");
|
||||
|
||||
//graph->printOut();
|
||||
|
||||
graph->tagInplaceNodes();
|
||||
|
||||
graph->getVariableSpace()->putVariable(85,0, t);
|
||||
graph->getVariableSpace()->putVariable(86,0, u);
|
||||
graph->getVariableSpace()->putVariable(87,0, v);
|
||||
|
||||
/*
|
||||
// validating graph now
|
||||
auto status = GraphExecutioner::execute(graph);
|
||||
ASSERT_EQ(Status::OK(), status);
|
||||
ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198));
|
||||
|
||||
auto array = graph->getVariableSpace()->getVariable(198)->getNDArray();
|
||||
ASSERT_EQ(z, *array);
|
||||
*/
|
||||
|
||||
nd4j::Environment::getInstance()->setProfiling(true);
|
||||
auto profile = GraphProfilingHelper::profile(graph, 1);
|
||||
|
||||
profile->printOut();
|
||||
|
||||
nd4j::Environment::getInstance()->setProfiling(false);
|
||||
delete profile;
|
||||
|
||||
/*
|
||||
std::vector<Nd4jLong> values;
|
||||
|
||||
for (int e = 0; e < 1; e++) {
|
||||
auto timeStart = std::chrono::system_clock::now();
|
||||
|
||||
GraphExecutioner::execute(graph);
|
||||
|
||||
auto timeEnd = std::chrono::system_clock::now();
|
||||
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
|
||||
values.emplace_back(outerTime);
|
||||
}
|
||||
|
||||
std::sort(values.begin(), values.end());
|
||||
|
||||
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
|
||||
*/
|
||||
|
||||
delete graph;
|
||||
}
|
||||
|
||||
/*
|
||||
TEST_F(PlaygroundTests, test_broadcast_1) {
|
||||
int pool = 10;
|
||||
std::vector<NDArray*> aX(pool);
|
||||
std::vector<NDArray*> aY(pool);
|
||||
std::vector<NDArray*> aZ(pool);
|
||||
|
||||
for (int e = 0; e < pool; e++) {
|
||||
aX[e] = NDArrayFactory::create_<float>('c', {64, 128, 1});
|
||||
aY[e] = NDArrayFactory::create_<float>('c', {768});
|
||||
aZ[e] = NDArrayFactory::create_<float>('c', {64, 128, 768});
|
||||
|
||||
aX[e]->assign(119 * (e+1));
|
||||
aY[e]->assign(119 * (e+3));
|
||||
}
|
||||
|
||||
std::vector<Nd4jLong> values;
|
||||
|
||||
for (int e = 0; e < 1000; e++) {
|
||||
auto x = aX[e < pool ? e : e % pool];
|
||||
auto y = aY[e < pool ? e : e % pool];
|
||||
auto z = aZ[e < pool ? e : e % pool];
|
||||
|
||||
auto timeStart = std::chrono::system_clock::now();
|
||||
|
||||
x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z);
|
||||
|
||||
auto timeEnd = std::chrono::system_clock::now();
|
||||
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
|
||||
values.emplace_back(outerTime);
|
||||
}
|
||||
|
||||
std::sort(values.begin(), values.end());
|
||||
|
||||
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
|
||||
|
||||
for (int e = 0; e < pool; e++) {
|
||||
delete aX[e];
|
||||
delete aY[e];
|
||||
delete aZ[e];
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
TEST_F(PlaygroundTests, test_s_0) {
|
||||
|
|
Loading…
Reference in New Issue