cavis/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp

2378 lines
86 KiB
C++

/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// Created by raver119 on 20.11.17.
//
#include "testlayers.h"
#include <Graph.h>
#include <chrono>
#include <Node.h>
#include <ops/declarable/CustomOperations.h>
#include <graph/profiling/GraphProfilingHelper.h>
#include <type_conversions.h>
#include <helpers/threshold.h>
#include <helpers/MmulHelper.h>
#include <ops/ops.h>
#include <OmpLaunchHelper.h>
#include <GradCheck.h>
#include <ops/declarable/helpers/im2col.h>
#include <Loops.h>
#include <helpers/BenchmarkHelper.h>
#include <ops/declarable/helpers/scatter.h>
#include <helpers/ConstantShapeHelper.h>
#include <helpers/ConstantTadHelper.h>
#include <array>
using namespace nd4j;
using namespace nd4j::graph;
/**
 * Fixture shared by the playground tests in this file.
 *
 * Provides the default repetition count (numIterations) and the size of the
 * NDArray pools (poolSize) that several tests read. Constructing the fixture
 * writes a newline to stdout and flushes it.
 */
class PlaygroundTests : public testing::Test {
public:
    PlaygroundTests() {
        // Emit an empty line and push it out immediately.
        printf("\n");
        fflush(stdout);
    }

    // Number of timed repetitions used by the lambda / no-cache tests.
    int numIterations = 3;

    // Number of arrays allocated per pool in the pool-based tests.
    int poolSize = 10;
};
/*
TEST_F(PlaygroundTests, LSTMBenchmarks_DebugTNS) {
BenchmarkHelper helper(5,10);
PredefinedParameters mb("mb", {1, 8, 64});
PredefinedParameters nInOut("nInOut", {32, 256, 1024});
ParametersBatch batch({&mb, &nInOut});
nd4j::ops::lstmBlock lstmBlock;
DeclarableBenchmark benchmark(lstmBlock, "lstm");
int seqLength = 64;
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int m = p.getIntParam("mb");
int n = p.getIntParam("nInOut");
Nd4jLong l = 0;
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>(l)); //Max TS length (unused)
//TNS format
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //x
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //i
ctx->setOutputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //c
ctx->setOutputArray(2, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //f
ctx->setOutputArray(3, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //o
ctx->setOutputArray(4, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //z
ctx->setOutputArray(5, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //h
ctx->setOutputArray(6, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //y
auto cLast = NDArrayFactory::create_<float>('c', {m, n});
auto yLast = NDArrayFactory::create_<float>('c', {m, n});
auto W = NDArrayFactory::create_<float>('c', {2 * n, 4 * n});
auto Wci = NDArrayFactory::create_<float>('c', {n});
auto Wcf = NDArrayFactory::create_<float>('c', {n});
auto Wco = NDArrayFactory::create_<float>('c', {n});
auto b = NDArrayFactory::create_<float>('c', {4 * n});
ctx->setInputArray(2, cLast);
ctx->setInputArray(3, yLast);
ctx->setInputArray(4, W);
ctx->setInputArray(5, Wci);
ctx->setInputArray(6, Wcf);
ctx->setInputArray(7, Wco);
ctx->setInputArray(8, b);
Nd4jLong *iargs = new Nd4jLong[2];
iargs[0] = 0; //No peephole
iargs[1] = 0; //TNS
ctx->setIArguments(iargs, 2);
delete[] iargs;
double *targs = new double[2];
targs[0] = 1.0; //forget bias
targs[1] = 0.0; //cell clipping value
ctx->setTArguments(targs, 2);
delete[] targs;
return ctx;
};
helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
}
TEST_F(PlaygroundTests, BroadcastOps2d) {
BenchmarkHelper helper;
PredefinedParameters rows("rows", {1024, 1048576});
IntPowerParameters cols("cols", 2, 2, 10, 2); //2^1 to 2^10 in steps of 2 - 2^1=2, ..., 2^10=1024
BoolParameters axis("axis");
BoolParameters inplace("inplace");
ParametersBatch batch({&rows, &cols, &axis, &inplace});
auto generator = PARAMETRIC_D() {
nd4j_printf("Entered generator\n","");
auto a = p.getIntParam("axis");
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
nd4j_printf("Created first array: [%lld, %lld]\n",arr->sizeAt(0), arr->sizeAt(1));
auto ctx = new Context(1);
ctx->setInputArray(0, arr, true);
if(a == 0){
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), 1}), true);
nd4j_printf("Created second array (a=0): [%lld, %lld]\n",ctx->getNDArray(1)->sizeAt(0), ctx->getNDArray(1)->sizeAt(1));
} else {
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
nd4j_printf("Created second array (a=1): [%lld, %lld]\n",ctx->getNDArray(1)->sizeAt(0), ctx->getNDArray(1)->sizeAt(1));
}
if (p.getIntParam("inplace") == 1) {
ctx->setOutputArray(0, arr, false);
ctx->markInplace(true);
nd4j_printf("Set result array (inplace)\n","");
} else {
auto out = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
ctx->setOutputArray(0, out, true);
nd4j_printf("Created and set result array (not inplace): [%lld, %lld]\n",out->sizeAt(0), out->sizeAt(1));
}
return ctx;
};
std::string s("add");
nd4j::ops::add op;
DeclarableBenchmark benchmark(op, "add");
nd4j_printf("About to execute\n","");
helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
}
*/
/**
 * Micro-benchmark: times NativeOpExecutioner::execReduceFloat with reduce::Mean
 * along dimension 0 of a [4, 64] float array filled with ones, writing into a
 * [64] float array, then prints median / mean / min / max time in nanoseconds.
 */
TEST_F(PlaygroundTests, test_small_reductions) {
    // NOTE(review): `f` is never read by the benchmark below; presumably it is
    // here to touch ~4 MB of memory before timing - confirm before removing.
    auto f = NDArrayFactory::create<float>('c', {1024 ,1024});
    f.assign(1.0f);

    int iterations = 1;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        // Operands are rebuilt every iteration; only the reduction call is timed.
        auto x = NDArrayFactory::create<float>('c', {4, 64});
        auto z = NDArrayFactory::create<float>('c', {64});
        x.assign(1.0f);
        int axis = 0;
        auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), axis);

        // steady_clock is monotonic, so a wall-clock adjustment during the run
        // cannot produce a negative or inflated sample.
        auto timeStart = std::chrono::steady_clock::now();
        NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext::defaultContext(), reduce::Mean,
                                             x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(),
                                             nullptr,
                                             z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(),
                                             &axis, 1, tadPack.primaryShapeInfo(), tadPack.primaryOffsets());
        auto timeEnd = std::chrono::steady_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds> ((timeEnd - timeStart)).count();
        results[e] = duration;
        mean += duration;
        if (duration > max)
            max = duration;
        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // Sorting puts the median at the middle index (upper median for even counts).
    std::sort(results.begin(), results.end());
    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
/**
 * Micro-benchmark: times the strict Sin transform applied to an array that was
 * permuted in place, and prints median / mean / min / max time in nanoseconds.
 *
 * x is created as c-order [64, 32, 4, 32] and permuted with {2, 0, 3, 1};
 * z is a c-order [4, 64, 32, 32] array (presumably the shape of x after the
 * permutation - confirm).
 */
TEST_F(PlaygroundTests, Test_PermutedArray_Operation_1) {
    auto x = NDArrayFactory::create<float>('c',{64, 32, 4, 32});
    auto z = NDArrayFactory::create<float>('c', {4, 64, 32, 32});
    x.assign(1.0f);
    x.permutei({2, 0, 3, 1});
    //x.printShapeInfo("x");

    int iterations = 1;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        // steady_clock is monotonic, unlike system_clock, so samples cannot be
        // skewed by wall-clock adjustments.
        auto timeStart = std::chrono::steady_clock::now();
        NativeOpExecutioner::execTransformStrict(LaunchContext::defaultContext(), transform::StrictOps::Sin,
                                                 x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(),
                                                 z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(),
                                                 nullptr, nullptr, nullptr);
        auto timeEnd = std::chrono::steady_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds> ((timeEnd - timeStart)).count();
        results[e] = duration;
        mean += duration;
        if (duration > max)
            max = duration;
        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // Sorted samples: the middle element is reported as the median.
    std::sort(results.begin(), results.end());
    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
/**
 * Micro-benchmark: times the strict Tanh transform from an NDArray that wraps
 * a stack buffer through a hand-written shape info into a freshly allocated
 * FLOAT32 output, over 100 iterations, and prints median / mean / min / max
 * time in nanoseconds.
 */
TEST_F(PlaygroundTests, Test_PermutedArray_Operation_2) {
    //x.printShapeInfo("x");
    int iterations = 100;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        // NOTE(review): presumably laid out as {rank, shape..., strides..., flags, ews, order}
        // - confirm against the shape helpers. The two arrays differ in the
        // third-from-last stride slot (1024 vs 256) and the ews slot (0 vs 1).
        // The previously declared but unused `eShapeInfo` (identical to
        // yShapeInfo) was removed; to benchmark that layout as input,
        // build x from yShapeInfo instead.
        Nd4jLong xShapeInfo[] = {2, 8, 256, 1024, 1, 8192, 0, 99};
        Nd4jLong yShapeInfo[] = {2, 8, 256, 256, 1, 8192, 1, 99};

        float xBuff[8*1024];
        NDArray x(xBuff, xShapeInfo);
        NDArray z(yShapeInfo, nd4j::DataType::FLOAT32, true);
        x.linspace(0.1f, 0.01f);

        // Only the transform is timed; steady_clock keeps the samples monotonic.
        auto timeStart = std::chrono::steady_clock::now();
        NativeOpExecutioner::execTransformStrict(LaunchContext::defaultContext(), transform::StrictOps::Tanh,
                                                 x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(),
                                                 z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(),
                                                 nullptr, nullptr, nullptr);
        auto timeEnd = std::chrono::steady_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds> ((timeEnd - timeStart)).count();
        results[e] = duration;
        mean += duration;
        if (duration > max)
            max = duration;
        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // Sorted samples: the middle element is reported as the median.
    std::sort(results.begin(), results.end());
    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
/**
 * Micro-benchmark: times NativeOps::execReduce3 with reduce3::CosineDistance
 * for a [16, 32] x against a [32] y along the dimension stored in `dim`,
 * writing into a [16] z, over 100 iterations.
 *
 * Unlike the neighbouring benchmarks, samples here are taken in MICROSECONDS;
 * the printed line carries no unit.
 */
TEST_F(PlaygroundTests, test_reduce_3) {
    // auto x = NDArrayFactory::create<float>('c', {4096, 8192});
    // auto y = NDArrayFactory::create<float>('c', {8192});
    // auto z = NDArrayFactory::create<float>('c', {4096});
    auto x = NDArrayFactory::create<float>('c', {16, 32});
    auto y = NDArrayFactory::create<float>('c', {32});
    auto z = NDArrayFactory::create<float>('c', {16});
    auto dim = NDArrayFactory::create<int>('c', {1}, {1});

    auto iterations = 100;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    NativeOps nativeOps;
    for (int e = 0; e < iterations; e++) {
        // steady_clock is monotonic, so wall-clock adjustments cannot distort
        // (or make negative) an individual sample.
        auto timeStart = std::chrono::steady_clock::now();
        nativeOps.execReduce3(nullptr, reduce3::CosineDistance, x.buffer(), x.shapeInfo(), x.specialBuffer(),
                              x.specialShapeInfo(), nullptr, y.buffer(), y.shapeInfo(), y.specialBuffer(),
                              y.specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(),
                              dim.buffer(), dim.shapeInfo(), dim.specialBuffer(), dim.specialShapeInfo(), nullptr,
                              nullptr, nullptr, nullptr);
        auto timeEnd = std::chrono::steady_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart)).count();
        results[e] = duration;
        mean += duration;
        if (duration > max)
            max = duration;
        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // Sorted samples: the middle element is reported as the median.
    std::sort(results.begin(), results.end());
    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
/*
TEST_F(PlaygroundTests, Test_OpBenchmark_1) {
BenchmarkHelper helper;
ScalarBenchmark sb1(scalar::Add, "add", NDArrayFactory::create_<float>('c', {100, 100}), NDArrayFactory::create_<float>(1.0f), NDArrayFactory::create_<float>('c', {100, 100}));
ScalarBenchmark sb2(scalar::Add, "add", NDArrayFactory::create_<float>('c', {1000, 1000}), NDArrayFactory::create_<float>(1.0f), NDArrayFactory::create_<float>('c', {1000, 1000}));
helper.runOperationSuit({&sb1, &sb2}, "ScalarAdd");
}
TEST_F(PlaygroundTests, Test_OpBenchmark_2) {
BenchmarkHelper helper;
Parameters parameters;
parameters.addBoolParam("fOrder", true);
float scalar = 2.0f;
auto fa = NDArrayFactory::create<int>(1);
ScalarBenchmark sb(scalar::Multiply);
// Y will be shared
sb.setY(NDArrayFactory::create_<float>(scalar));
auto generator = GENERATE_XZ() {
// operands go together line by line
x.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
z.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
x.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
z.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
// only share within single op call. do not cross share
auto shared = NDArrayFactory::create_<float>('c', {256, 768});
x.push_back(shared);
z.push_back(shared);
// using bool param here
if (parameters.getBoolParam("fOrder")) {
x.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
z.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
}
//another way to call inplace op
x.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
z.push_back(nullptr);
};
helper.runOperationSuit(&sb, generator, "ScalarTest");
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
// we can use the same generator, since the same number of operands used
helper.runOperationSuit(&tb, generator, "TransformTest");
PairwiseBenchmark pb(pairwise::Pow, "pow test");
auto generatorXYZ = GENERATE_XYZ() {
x.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
y.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
z.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
x.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
y.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
z.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
};
helper.runOperationSuit(&pb, generatorXYZ, "PairwiseTest");
auto generatorReductionAxis = GENERATE_XYZ() {
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
// axis goes to y here
y.push_back(NDArrayFactory::create_<int>(0));
z.push_back(NDArrayFactory::create_<float>('c', {1000}));
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
y.push_back(NDArrayFactory::create_<int>(1));
z.push_back(NDArrayFactory::create_<float>('c', {100}));
// scalar case
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rb(reduce::FloatOps::Mean);
helper.runOperationSuit(&rb, (const std::function<void (ResultSet &, ResultSet &, ResultSet &)>)(generatorReductionAxis), "ReductionAlongDimensionTest");
}
TEST_F(PlaygroundTests, Test_OpBenchmark_3) {
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
PredefinedParameters a("alpha", {2, 3, 4});
PredefinedParameters b("beta", {9, 15, 27});
ParametersBatch batch({&a, &b});
auto parameters = batch.parameters();
ASSERT_EQ(9, parameters.size());
auto params_0 = parameters[0];
ASSERT_EQ(2, params_0.getIntParam("alpha"));
ASSERT_EQ(9, params_0.getIntParam("beta"));
auto params_1 = parameters[1];
ASSERT_EQ(2, params_1.getIntParam("alpha"));
ASSERT_EQ(15, params_1.getIntParam("beta"));
auto params_3 = parameters[3];
ASSERT_EQ(3, params_3.getIntParam("alpha"));
ASSERT_EQ(9, params_3.getIntParam("beta"));
}
TEST_F(PlaygroundTests, Test_OpBenchmark_4) {
BenchmarkHelper helper;
PairwiseBenchmark pb(pairwise::Ops::Add, "PWT ADD");
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
ScalarBenchmark sb(scalar::Multiply);
sb.setY(NDArrayFactory::create_<float>(119.0f));
PredefinedParameters a("alpha", {2, 3, 4});
PredefinedParameters b("beta", {9, 15, 27});
ParametersBatch batch({&a, &b});
auto generator = PARAMETRIC_XZ() {
// operands go together line by line
x.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha") , p.getIntParam("beta")}));
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
};
auto generatorXYZ = PARAMETRIC_XYZ() {
// operands go together line by line
x.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha") , p.getIntParam("beta")}));
y.push_back(NDArrayFactory::create_<float>('f', {p.getIntParam("alpha") , p.getIntParam("beta")}));
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
};
helper.runOperationSuit(&tb, generator, batch, "TransformTanh");
helper.runOperationSuit(&sb, generator, batch, "ScalarMultiply");
helper.runOperationSuit(&pb, generatorXYZ, batch, "PairwiseAdd");
}
TEST_F(PlaygroundTests, Test_OpBenchmark_5) {
BenchmarkHelper helper;
TransformBenchmark tb(transform::StrictOps::Sigmoid, "sigmoid");
IntParameters length("length", 100, 500, 100);
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
// operands go together line by line
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
helper.runOperationSuit(&tb, generator, batch, "Transform_Sigmoid");
}
TEST_F(PlaygroundTests, Test_Something_5) {
auto x = NDArrayFactory::create<float>('c', {100, 10});
auto y = NDArrayFactory::create<float>('c', {10});
auto z = NDArrayFactory::create<float>('c', {100, 10});
std::vector<int> axis = {1};
NativeOpExcutioner::execBroadcast(broadcast::Add, x.buffer(), x.shapeInfo(), y.buffer(), y.shapeInfo(), z.buffer(), z.shapeInfo(),
axis.data(), axis.size(), nullptr, nullptr,
nullptr, nullptr);
}
#define PARAMETRIC_D() [&] (Parameters &p) -> Context*
/*
TEST_F(PlaygroundTests, Test_OpBenchmark_6) {
BenchmarkHelper helper;
nd4j::ops::softmax op;
DeclarableBenchmark db(op, "SoftMaxTest");
PredefinedParameters a("alpha", {128, 256});
PredefinedParameters b("beta", {1024, 2048});
ParametersBatch batch({&a, &b});
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
return ctx;
};
helper.runOperationSuit(&db, generator, batch, "parametrized softmax test");
}
*/
/*
TEST_F(PlaygroundTests, Test_Strided_Stuff) {
auto array = NDArrayFactory::create<float>('c', {1048576, 1024});
auto strided = array({0,0, 3, 4}, true);
auto z = NDArrayFactory::create<float>(0.0f);
//strided->shapeInfo()[shape::shapeInfoLength(strided->rankOf()) - 2] = 1024;
int N = 1000;
auto timeStart = std::chrono::system_clock::now();
for (int e = 0; e < N; e++)
NativeOpExcutioner::execReduceSameScalar(reduce::ReduceSameBenchmarkOp, strided.buffer(), strided.shapeInfo(), nullptr, z.buffer(), z.shapeInfo());
auto timeEnd = std::chrono::system_clock::now();
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();
auto ttlTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart)).count();
nd4j_printf("average time: %lld us;\n", spanTime);
nd4j_printf("total time: %lld ms;\n", ttlTime);
}
*/
/*
TEST_F(PlaygroundTests, StridedReductionsNoEWS) {
nd4j_printf("SETTING ELEMENTWISE THRESHOLD AND TAD THRESHOLD TO 1/1","");
nd4j::Environment::getInstance()->setElementwiseThreshold(1);
nd4j::Environment::getInstance()->setTadThreshold(1);
BenchmarkHelper helper;
IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024
ParametersBatch batch({&stride});
//This is an edge case: technically an EWS *should* be available here
auto generator1 = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {1048576 + (stride == 1 ? 0 : 1), stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
strided = new NDArray((*arr)({0,1048576, 0,1}, true)); //All rows, first column
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator1), batch, "Strided Sum - No EWS Test 1");
//No EWS defined for this case
auto generator2 = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
strided = new NDArray((*arr)({0,2*1024,2, 0,0,0, 0,1,1}, true, true));
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS");
helper.runOperationSuit(&rbSum2, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator2), batch, "Strided Sum - No EWS Test 2");
}
*/
#ifndef __CUDABLAS__
/**
 * Applies an element-wise "+ 32.12f" lambda to an [8192, 1024] float array
 * numIterations times and measures the total elapsed microseconds.
 * The measurement is currently not printed (see the commented line).
 */
TEST_F(PlaygroundTests, LambdaTest_1) {
    using Clock = std::chrono::system_clock;

    auto data = NDArrayFactory::create<float>('c', {8192, 1024});
    data.linspace(1);

    auto addOffset = LAMBDA_F(val) {
        return val + 32.12f;
    };

    const auto begin = Clock::now();
    for (int pass = 0; pass < numIterations; ++pass)
        data.applyLambda<float>(addOffset);
    const auto finish = Clock::now();

    auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(finish - begin).count();
    // nd4j_printf("Lambda 1 time %lld us\n", elapsedUs / numIterations);
}
/**
 * Counterpart of LambdaTest_1 that uses a broadcast instead of a lambda:
 * applies broadcast::Add of a [1, 1024] row along dimension 1 of an
 * [8192, 1024] float array numIterations times and measures the total
 * elapsed microseconds. The measurement is currently not printed.
 *
 * The lambda that used to be declared here was never used by the test and
 * has been dropped.
 */
TEST_F(PlaygroundTests, LambdaTest_2) {
    auto array = NDArrayFactory::create<float>('c', {8192, 1024});
    auto row = NDArrayFactory::create<float>('c', {1, 1024});
    array.linspace(1);

    // steady_clock: monotonic, appropriate for interval measurement.
    auto timeStart = std::chrono::steady_clock::now();
    for (int e = 0; e < numIterations; e++) {
        array.applyBroadcast(broadcast::Add, {1}, &row);
    }
    auto timeEnd = std::chrono::steady_clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
    // nd4j_printf("Broadcast time %lld us\n", outerTime / numIterations);
}
/**
 * Applies a "* 32.12f" lambda to a different pooled copy of an [8192, 1024]
 * float array on each of numIterations passes, walking the pool from its last
 * slot towards the first and wrapping around. Total elapsed microseconds are
 * measured but currently not printed.
 */
TEST_F(PlaygroundTests, NoCacheTest_1) {
    using Clock = std::chrono::system_clock;

    std::vector<NDArray*> pool(poolSize);
    auto source = NDArrayFactory::create<float>('c', {8192, 1024});
    for (auto &slot : pool)
        slot = source.dup();

    auto scale = LAMBDA_F(val) {
        return val * 32.12f;
    };

    const auto begin = Clock::now();
    int cursor = 0;
    for (int pass = 0; pass < numIterations; ++pass) {
        // Pick the next array counting down from the end of the pool.
        NDArray *target = pool[poolSize - 1 - cursor];
        target->applyLambda<float>(scale);

        // Advance and wrap once every slot has been visited.
        ++cursor;
        if (cursor == poolSize)
            cursor = 0;
    }
    const auto finish = Clock::now();

    auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(finish - begin).count();
    // nd4j_printf("Non-cached time %lld us\n", elapsedUs / numIterations);

    // The pool holds raw pointers returned by dup(); release them here.
    for (NDArray *entry : pool)
        delete entry;
}
/**
 * Pairwise variant of NoCacheTest_1: on each of numIterations passes applies
 * "x * 32.12f + y" to one array taken from the back of pool1 and one taken
 * from the front of pool2. Total elapsed microseconds are measured but
 * currently not printed.
 */
TEST_F(PlaygroundTests, NoCacheTest_2) {
    using Clock = std::chrono::system_clock;

    std::vector<NDArray*> pool1(poolSize);
    std::vector<NDArray*> pool2(poolSize);
    auto source = NDArrayFactory::create<float>('c', {8192, 1024});
    for (int slot = 0; slot < poolSize; ++slot) {
        pool1[slot] = source.dup();
        pool2[slot] = source.dup();
    }

    auto scaleAndAdd = LAMBDA_FF(lhs, rhs) {
        return lhs * 32.12f + rhs;
    };

    const auto begin = Clock::now();
    int cursor = 0;
    for (int pass = 0; pass < numIterations; ++pass) {
        // pool1 is walked backwards, pool2 forwards, using the same cursor.
        NDArray *left = pool1[poolSize - 1 - cursor];
        NDArray *right = pool2[cursor];
        ++cursor;

        left->applyPairwiseLambda<float>(right, scaleAndAdd);

        if (cursor == poolSize)
            cursor = 0;
    }
    const auto finish = Clock::now();

    auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(finish - begin).count();
    // nd4j_printf("Non-cached PWT time %lld us\n", elapsedUs / numIterations);

    // Both pools hold raw pointers returned by dup(); release them here.
    for (NDArray *entry : pool1)
        delete entry;
    for (NDArray *entry : pool2)
        delete entry;
}
#endif
/**
 * Runs sumNumber() on pooled copies of a [1, 100] float array and measures
 * the elapsed time (nanoseconds and milliseconds). The loop currently runs a
 * single iteration and the measurement is not printed.
 *
 * The pool cursor is now advanced every iteration; previously it was never
 * incremented, so the wrap-around check was dead and every iteration would
 * have hit the same array if the iteration count were raised.
 */
TEST_F(PlaygroundTests, ReductionTest_1) {
    std::vector<NDArray*> pool1(poolSize);
    std::vector<NDArray*> pool2(poolSize);
    auto source = NDArrayFactory::create<float>('c', {1, 100});
    // NOTE(review): pool2 is filled and freed but never read by this test.
    for (size_t e = 0; e < pool1.size(); e++) {
        pool1[e] = source.dup();
        pool2[e] = source.dup();
    }

    // steady_clock: monotonic, appropriate for interval measurement.
    auto timeStart = std::chrono::steady_clock::now();
    int cnt = 0;
    for (int e = 0; e < 1; e++) {
        auto v = pool1[poolSize - 1 - cnt];
        auto r = v->sumNumber();

        // Rotate through the pool so consecutive iterations use different arrays.
        cnt++;
        if (cnt == poolSize)
            cnt = 0;
    }
    auto timeEnd = std::chrono::steady_clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds> (timeEnd - timeStart).count();
    auto outerTimeMs = std::chrono::duration_cast<std::chrono::milliseconds> (timeEnd - timeStart).count();
    // nd4j_printf("Non-cached reduction time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000, outerTimeMs);

    // The pools hold raw pointers returned by dup(); release them here.
    for (auto v: pool1)
        delete v;
    for (auto v: pool2)
        delete v;
}
/**
 * Applies scalar::Add with 2.0f to the same [1, 100] float array 100 times
 * and measures the elapsed time (nanoseconds and milliseconds). The
 * measurement is currently not printed. Two pools of copies are allocated and
 * freed, but the active code only operates on `source`.
 */
TEST_F(PlaygroundTests, ScalarTest_1) {
    using Clock = std::chrono::system_clock;

    std::vector<NDArray*> pool1(poolSize);
    std::vector<NDArray*> pool2(poolSize);
    auto source = NDArrayFactory::create<float>('c', {1, 100});
    for (int slot = 0; slot < poolSize; ++slot) {
        pool1[slot] = source.dup();
        pool2[slot] = source.dup();
    }

    const auto begin = Clock::now();
    int cursor = 0;
    // Only referenced by the commented-out alternative below.
    float *buff = reinterpret_cast<float*>(source.buffer());
    for (int pass = 0; pass < 100; ++pass) {
        //auto v = pool1[poolSize - 1 - cnt];
        //v->template applyScalar<simdOps::Add<float>>(2.0f);
        source.applyScalar(scalar::Add,2.0f);
        //functions::scalar::ScalarTransform<float>::template transformEx<simdOps::Add<float>>(source.buffer(), 1, source.buffer(), 1, 2.0f, nullptr, source.lengthOf());
        //functions::scalar::ScalarTransform<float>::template transform<simdOps::Add<float>>(buff, 1, buff, 1, 2.0f, nullptr, 100);

        // Cursor bookkeeping is kept although the active code does not index the pools.
        ++cursor;
        if (cursor == poolSize)
            cursor = 0;
    }
    const auto finish = Clock::now();

    auto elapsedNs = std::chrono::duration_cast<std::chrono::nanoseconds>(finish - begin).count();
    auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(finish - begin).count();
    // nd4j_printf("Cached scalar time avg: %lld ns; Total time: %lld ms;\n", elapsedNs / 100000L, elapsedMs);

    for (NDArray *entry : pool1)
        delete entry;
    for (NDArray *entry : pool2)
        delete entry;
}
/**
 * Hand-rolled counterpart of ScalarTest_1: adds 2.0f to every element of a
 * [1, 100] float array through simdOps::Add in a PRAGMA_OMP_SIMD loop,
 * repeated 1000 times, and measures the elapsed time (nanoseconds and
 * milliseconds). The measurement is currently not printed.
 */
TEST_F(PlaygroundTests, ScalarTest_2) {
    using Clock = std::chrono::system_clock;

    std::vector<NDArray*> pool1(poolSize);
    std::vector<NDArray*> pool2(poolSize);
    auto source = NDArrayFactory::create<float>('c', {1, 100});
    for (int slot = 0; slot < poolSize; ++slot) {
        pool1[slot] = source.dup();
        pool2[slot] = source.dup();
    }

    const auto begin = Clock::now();
    int cursor = 0;
    float *data = reinterpret_cast<float*>(source.buffer());
    for (int pass = 0; pass < 1000; ++pass) {
        // The pragma must stay directly in front of the element loop.
        PRAGMA_OMP_SIMD
        for (int i = 0; i < source.lengthOf(); i++) {
            data[i] = simdOps::Add<float, float, float>::op(data[i], 2.0f);
        }

        // Cursor bookkeeping is kept although the pools are not indexed here.
        ++cursor;
        if (cursor == poolSize)
            cursor = 0;
    }
    const auto finish = Clock::now();

    auto elapsedNs = std::chrono::duration_cast<std::chrono::nanoseconds>(finish - begin).count();
    auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(finish - begin).count();
    // nd4j_printf("Cached manual scalar time avg: %lld ns; Total time: %lld ms;\n", elapsedNs / 100000, elapsedMs);

    for (NDArray *entry : pool1)
        delete entry;
    for (NDArray *entry : pool2)
        delete entry;
}
/**
 * Smoke test for GraphProfile: sets build/execution times, records a few
 * events, and populates three node profiles with sizes and timings. Nothing
 * is asserted; the printout call is left commented out.
 */
TEST_F(PlaygroundTests, Test_Profile_1) {
    GraphProfile profile;

    // Graph-level timings.
    profile.setBuildTime(70);
    profile.setExecutionTime(130);

    // "omega" is started first and recorded after three spot events.
    profile.startEvent("omega");
    profile.spotEvent("alpha");
    profile.spotEvent("beta");
    profile.spotEvent("gamma");
    profile.recordEvent("omega");

    // Look up the three node profiles by id and name.
    auto matMulNode = profile.nodeById(1, "MatMul");
    auto sumNode = profile.nodeById(2, "Sum");
    auto convNode = profile.nodeById(3, "Conv2D");

    // Node 1: MatMul
    matMulNode->setObjectsSize(512);
    matMulNode->setTemporarySize(65536);
    matMulNode->setActivationsSize(512387);
    matMulNode->setPreparationTime(127);
    matMulNode->setExecutionTime(6539);

    // Node 2: Sum
    sumNode->setObjectsSize(0);
    sumNode->setTemporarySize(0);
    sumNode->setActivationsSize(512387);
    sumNode->setPreparationTime(132);
    sumNode->setExecutionTime(2047);

    // Node 3: Conv2D
    convNode->setObjectsSize(1536);
    convNode->setTemporarySize(2355674);
    convNode->setActivationsSize(1022092);
    convNode->setPreparationTime(129);
    convNode->setExecutionTime(12983);

    // profile.printOut();
}
#ifdef GRAPH_FILES_OK
/**
 * Imports the graph stored in ./resources/ae_00.fb with profiling enabled and
 * runs GraphProfilingHelper::profile on it with an argument of 2.
 *
 * Profiling is a process-wide Environment flag, so it is switched off again
 * before the test returns to avoid leaking into later tests. A failed import
 * now fails the test instead of passing a null graph to the profiler.
 */
TEST_F(PlaygroundTests, Test_Profile_2) {
    Environment::getInstance()->setProfiling(true);

    auto graph = GraphExecutioner::importFromFlatBuffers("./resources/ae_00.fb");
    const bool imported = graph != nullptr;

    // Only profile a graph that was actually loaded.
    auto profile = imported ? GraphProfilingHelper::profile(graph, 2) : nullptr;
    // profile->printOut();

    // Deleting a null pointer is a no-op, so no extra guards are needed.
    delete graph;
    delete profile;

    // NOTE(review): assumes profiling was disabled before this test ran - confirm.
    Environment::getInstance()->setProfiling(false);

    ASSERT_TRUE(imported);
}
#endif
/**
 * Runs im2col through three entry points, each once into a plain c-order
 * output and once into a permuted output, measuring microseconds per phase:
 *   1. nd4j::ops::im2col::execute
 *   2. the legacy NDArray::applyTransform(transform::Im2col, ...)
 *   3. NativeOps::execCustomOp with raw buffer / shape pointers
 * The two execute() phases assert Status::OK(); timings are currently not
 * printed (see the commented lines at the end).
 */
TEST_F(PlaygroundTests, Test_Im2Col_1) {
    using Clock = std::chrono::system_clock;
    using Micros = std::chrono::microseconds;

    // Problem dimensions (names kept from the original test).
    int bS = 16;
    int iH = 224, iW = 224;
    int iC = 3, oC = 3;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int iterations = 1;

    auto input = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});
    auto output = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});
    auto outputPermuted = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    outputPermuted.permutei({0, 3, 4, 5, 1, 2});

    nd4j::ops::im2col op;

    // Phase 1a: custom op, plain output.
    const auto directBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        auto status = op.execute({&input}, {&output}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto directEnd = Clock::now();
    auto directUs = std::chrono::duration_cast<Micros>(directEnd - directBegin).count();

    // outputPermuted.printShapeInfo("permuted shape");

    // Phase 1b: custom op, permuted output.
    const auto permBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        auto status = op.execute({&input}, {&outputPermuted}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto permEnd = Clock::now();
    auto permUs = std::chrono::duration_cast<Micros>(permEnd - permBegin).count();

    // Phase 2a: legacy transform, plain output. The extra-arguments object is
    // built inside the timed region, as before.
    const auto legacyBegin = Clock::now();
    ExtraArguments extra({static_cast<double>(kH), static_cast<double>(kW),
                          static_cast<double>(sH), static_cast<double>(sW),
                          static_cast<double>(pH), static_cast<double>(pW),
                          static_cast<double>(dH), static_cast<double>(dW),
                          static_cast<double>(0.f), static_cast<double>(0.f)});
    for (int run = 0; run < iterations; ++run) {
        input.applyTransform(transform::Im2col, &output, &extra);
    }
    const auto legacyEnd = Clock::now();
    auto legacyUs = std::chrono::duration_cast<Micros>(legacyEnd - legacyBegin).count();

    // Phase 2b: legacy transform, permuted output.
    const auto legacyPermBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        input.applyTransform(transform::Im2col, &outputPermuted, &extra);
    }
    const auto legacyPermEnd = Clock::now();
    auto legacyPermUs = std::chrono::duration_cast<Micros>(legacyPermEnd - legacyPermBegin).count();

    // Phase 3: NativeOps entry point with raw pointers.
    NativeOps nativeOps;
    Nd4jLong iArgs[] = {kH, kW, sH, sW, pH, pW, dH, dW, 0};
    Nd4jPointer inputBuffers[] = {input.buffer()};
    Nd4jPointer inputShapes[] = {input.shapeInfo()};
    Nd4jPointer outputBuffers[] = {output.buffer()};
    Nd4jPointer outputShapes[] = {output.shapeInfo()};

    // Phase 3a: plain output.
    const auto javaBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        nativeOps.execCustomOp(nullptr, op.getOpHash(), inputBuffers, inputShapes, 1, outputBuffers, outputShapes, 1, nullptr, 0, iArgs, 9, nullptr, 0, false);
    }
    const auto javaEnd = Clock::now();
    auto javaUs = std::chrono::duration_cast<Micros>(javaEnd - javaBegin).count();

    // Phase 3b: permuted output.
    Nd4jPointer outputPermBuffers[] = {outputPermuted.buffer()};
    Nd4jPointer outputPermShapes[] = {outputPermuted.shapeInfo()};
    const auto javaPermBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        nativeOps.execCustomOp(nullptr, op.getOpHash(), inputBuffers, inputShapes, 1, outputPermBuffers, outputPermShapes, 1, nullptr, 0, iArgs, 9, nullptr, 0, false);
    }
    const auto javaPermEnd = Clock::now();
    auto javaPermUs = std::chrono::duration_cast<Micros>(javaPermEnd - javaPermBegin).count();

    // nd4j_printf("New time: %lld us;\n", directUs / iterations);
    // nd4j_printf("Permuted time: %lld us;\n", permUs / iterations);
    // nd4j_printf("Legacy time: %lld us;\n", legacyUs / iterations);
    // nd4j_printf("Legacy Permuted time: %lld us;\n", legacyPermUs / iterations);
    // nd4j_printf("Java time: %lld us;\n", javaUs / iterations);
    // nd4j_printf("Java Permuted time: %lld us;\n", javaPermUs / iterations);
}
/**
 * Single im2col invocation through NativeOps::execCustomOp, writing into a
 * permuted output array. Nothing is asserted; the test only exercises the
 * call. The `output` array is allocated but not passed to the op.
 */
TEST_F(PlaygroundTests, Test_Im2Col_2) {
    auto input = NDArrayFactory::create<float>('c', {16, 3, 224, 224});
    auto output = NDArrayFactory::create<float>('c', {16, 3, 11, 11, 55, 55});

    // Allocated in one layout and then permuted in place.
    auto outputPermuted = NDArrayFactory::create<float>('c', {16, 55, 55, 3, 11, 11});
    outputPermuted.permutei({0, 3, 4, 5, 1, 2});

    nd4j::ops::im2col op;
    NativeOps nativeOps;

    // Nine integer arguments forwarded to the op.
    Nd4jLong iArgs[] = {11, 11, 4, 4, 2, 2, 1, 1, 0};

    // Raw pointer tables: one input, one output.
    Nd4jPointer inputBuffers[] = {input.buffer()};
    Nd4jPointer inputShapes[] = {input.shapeInfo()};
    Nd4jPointer outputPermBuffers[] = {outputPermuted.buffer()};
    Nd4jPointer outputPermShapes[] = {outputPermuted.shapeInfo()};

    nativeOps.execCustomOp(nullptr, op.getOpHash(),
                           inputBuffers, inputShapes, 1,
                           outputPermBuffers, outputPermShapes, 1,
                           nullptr, 0, iArgs, 9, nullptr, 0, false);
}
/**
 * Runs nd4j::ops::col2im once on plain c-order arrays and once on permuted
 * arrays, asserting Status::OK() for both and measuring microseconds per
 * phase. Timings are currently not printed.
 */
TEST_F(PlaygroundTests, Test_Col2Im_1) {
    using Clock = std::chrono::system_clock;
    using Micros = std::chrono::microseconds;

    // Problem dimensions (names kept from the original test).
    int bS = 16;
    int iH = 224, iW = 224;
    int iC = 3, oC = 3;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int iterations = 1;

    auto input = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});
    auto output = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});

    // Permuted counterparts: allocated in one layout, then permuted in place.
    auto inputPermuted = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    inputPermuted.permutei({0, 3, 4, 5, 1, 2});
    auto outputPermuted = NDArrayFactory::create<float>('c', {bS, iH, iW, iC});
    outputPermuted.permutei({0, 3, 1, 2});

    // Fill inputs with 10 and outputs with 2.
    input = 10.;
    output = 2.;
    inputPermuted = 10.;
    outputPermuted = 2.;

    nd4j::ops::col2im op;

    // Phase 1: plain c-order arrays.
    const auto plainBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        auto status = op.execute({&input}, {&output}, {}, {sH, sW, pH, pW, iH, iW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto plainEnd = Clock::now();
    auto plainUs = std::chrono::duration_cast<Micros>(plainEnd - plainBegin).count();

    // Phase 2: permuted arrays.
    const auto permBegin = Clock::now();
    for (int run = 0; run < iterations; ++run) {
        auto status = op.execute({&inputPermuted}, {&outputPermuted}, {}, {sH, sW, pH, pW, iH, iW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto permEnd = Clock::now();
    auto permUs = std::chrono::duration_cast<Micros>(permEnd - permBegin).count();

    // nd4j_printf("C-order time: %lld us;\n", plainUs / iterations);
    // nd4j_printf("Permuted time: %lld us;\n", permUs / iterations);
}
// Executes im2col twice: once on plain c-order arrays and once on permuted views.
// Only the returned status of each execution is asserted; the measured times are
// computed but currently not reported (the print statements are commented out).
TEST_F(PlaygroundTests, Test_Im2Col_3) {
    int bS = 16, iC = 3, oC = 3;
    int iH = 224, iW = 224;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int numRuns = 1;

    auto cols = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});
    auto image = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});

    // Permuted variants: allocated with another axis order and permuted in place.
    auto colsView = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    colsView.permutei({0, 3, 4, 5, 1, 2});
    auto imageView = NDArrayFactory::create<float>('c', {bS, iH, iW, iC});
    imageView.permutei({0, 3, 1, 2});

    image = 10.;
    cols = 2.;
    imageView = 10.;
    colsView = 2.;

    nd4j::ops::im2col im2colOp;

    // Pass 1: c-order arrays.
    const auto plainBegin = std::chrono::system_clock::now();
    for (int run = 0; run < numRuns; run++) {
        auto status = im2colOp.execute({&image}, {&cols}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto plainEnd = std::chrono::system_clock::now();
    auto plainUs = std::chrono::duration_cast<std::chrono::microseconds> (plainEnd - plainBegin).count();

    // Pass 2: permuted views.
    const auto viewBegin = std::chrono::system_clock::now();
    for (int run = 0; run < numRuns; run++) {
        auto status = im2colOp.execute({&imageView}, {&colsView}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto viewEnd = std::chrono::system_clock::now();
    auto viewUs = std::chrono::duration_cast<std::chrono::microseconds> (viewEnd - viewBegin).count();

    // nd4j_printf("C-order time: %lld us;\n", plainUs / numRuns);
    // nd4j_printf("Permuted time: %lld us;\n", viewUs / numRuns);
}
// Times estimateThreshold + convertToThreshold on a randomly filled array.
// The test is switched off by the unconditional early return below; the remaining
// code is kept so that it continues to compile.
TEST_F(PlaygroundTests, loop_test_1) {
    if (1>0)
        return;

    auto f = NDArrayFactory::create<float>('c', {2}, {5000, 10000});
    nd4j::ops::randomuniform op;
    auto result = op.execute({&f}, {-1.0f, 1.0f}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto array = result->at(0);
    auto buffer = array->buffer();
    int cnt = 0;
    int iterations = 1;
    //nd4j_printf("Array length: %lld\n", array->lengthOf());
    int length = (int) array->lengthOf();

    NativeOps ops;
    auto t = new int[1000000];

    // FloatBits is used to store the threshold's raw bit pattern in the int buffer.
    FloatBits fb;
    float threshold = 0.99f;
    fb.f_ = threshold;

    int le = ops.estimateThreshold(nullptr, reinterpret_cast<void *>(array->buffer()), array->shapeInfo(), static_cast<int>(array->lengthOf()), threshold);

    // The first three slots of the target buffer are filled in before conversion.
    t[0] = le;
    t[1] = length;
    t[2] = fb.i_;
    //nd4j_printf("number of elements: [%i]\n", le);

    // long long so that the accumulator matches the %lld conversion used below on
    // every platform (plain long is only 32 bits wide on LLP64 targets).
    long long permTime = 0;
    for (int x = 0; x < iterations; x++) {
        auto permStart = std::chrono::system_clock::now();
        ops.estimateThreshold(nullptr, reinterpret_cast<void *>(array->buffer()), array->shapeInfo(), static_cast<int>(array->lengthOf()), threshold);
        TypeCast::convertToThreshold<float>(nullptr, buffer, array->lengthOf(), t);
        auto permEnd = std::chrono::system_clock::now();
        permTime += std::chrono::duration_cast<std::chrono::microseconds> (permEnd - permStart).count();
    }

    nd4j_printf("Permuted time: %lld us; Counter: %i;\n", permTime / iterations, cnt);

    delete result;
    delete[] t;
}
//////////////////////////////////////////////////////////////////////
// Tiles a c-order {20, 30} array with repeats {2,2,2} and checks that the result
// has the same shape as a {2,40,60} array. The elapsed time is measured but not printed.
TEST_F(PlaygroundTests, ndarray_tile_test1) {
    auto source = NDArrayFactory::create<float>('c', {20, 30});
    auto expected = NDArrayFactory::create<float>('c', {2,40,60});

    const auto begin = std::chrono::system_clock::now();
    auto tiled = source.tile({2,2,2});
    const auto end = std::chrono::system_clock::now();

    auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds> (end - begin).count();
    // nd4j_printf("c-order time: %d;\n", elapsedUs);

    ASSERT_TRUE(tiled.isSameShape(&expected));
}
//////////////////////////////////////////////////////////////////////
// Tiles an f-order {20, 30} array with repeats {2,2,2} and checks that the result
// has the same shape as a {2,40,60} array. The elapsed time is measured but not printed.
TEST_F(PlaygroundTests, ndarray_tile_test2) {
    auto source = NDArrayFactory::create<float>('f', {20, 30});
    auto expected = NDArrayFactory::create<float>('f', {2,40,60});

    const auto begin = std::chrono::system_clock::now();
    auto tiled = source.tile({2,2,2});
    const auto end = std::chrono::system_clock::now();

    auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds> (end - begin).count();
    // nd4j_printf("f-order time: %d;\n", elapsedUs);

    ASSERT_TRUE(tiled.isSameShape(&expected));
}
//////////////////////////////////////////////////////////////////////
// Times an element-by-element copy from an f-order array into a c-order array of the
// same length, resolving every element's offset via shape::getIndexOffset.
// The identical loop is run and timed twice ("My time" / "Old time").
TEST_F(PlaygroundTests, loopThroughArrs_test1) {
    NDArray x('c', {20, 30, 40}, nd4j::DataType::DOUBLE);
    NDArray y('f', {50, 30, 4, 4}, nd4j::DataType::DOUBLE);

    auto dst = x.bufferAsT<double>();
    auto src = y.bufferAsT<double>();
    auto len = x.lengthOf();

    //***********************************
    // first timed pass
    //***********************************
    auto passBegin = std::chrono::system_clock::now();
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
    for(Nd4jLong i = 0; i < len; ++i) {
        Nd4jLong dstOffset = shape::getIndexOffset(i, x.getShapeInfo(), len);
        Nd4jLong srcOffset = shape::getIndexOffset(i, y.getShapeInfo(), len);
        dst[dstOffset] = src[srcOffset];
    }
    auto passEnd = std::chrono::system_clock::now();
    auto myTime = std::chrono::duration_cast<std::chrono::microseconds> (passEnd - passBegin).count();

    //***********************************
    // second timed pass (same loop)
    //***********************************
    passBegin = std::chrono::system_clock::now();
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
    for(Nd4jLong i = 0; i < len; ++i) {
        Nd4jLong dstOffset = shape::getIndexOffset(i, x.getShapeInfo(), len);
        Nd4jLong srcOffset = shape::getIndexOffset(i, y.getShapeInfo(), len);
        dst[dstOffset] = src[srcOffset];
    }
    passEnd = std::chrono::system_clock::now();
    auto oldTime = std::chrono::duration_cast<std::chrono::microseconds> (passEnd - passBegin).count();

    nd4j_printf("My time: %lld us;\n", myTime);
    nd4j_printf("Old time: %lld us;\n", oldTime);

    ASSERT_TRUE(1);
}
static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
auto len = shape::length(xShapeInfo);
int xEws = shape::elementWiseStride(xShapeInfo);
int yEws = shape::elementWiseStride(yShapeInfo);
int zEws = shape::elementWiseStride(zShapeInfo);
BlockInformation info(len, ELEMENT_THRESHOLD);
PRAGMA_OMP_PARALLEL_ARGS(num_threads(info.threads) if (info.threads > 1))
{
auto i = omp_get_thread_num();
Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder;
Nd4jLong index = i * info.items;
auto xi = x + xEws * index;
auto yi = y + yEws * index;
auto zi = z + zEws * index;
PRAGMA_OMP_SIMD
for (Nd4jLong j = 0; j < itemsToLoop; j++)
zi[j * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(xi[j * xEws], yi[j * yEws]);
}
}
static void loopSimple(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
auto len = shape::length(xShapeInfo);
int xEws = shape::elementWiseStride(xShapeInfo);
int yEws = shape::elementWiseStride(yShapeInfo);
int zEws = shape::elementWiseStride(zShapeInfo);
int threads = 6;
int span_size = len / threads + 1;
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close))
for(Nd4jLong i = 0; i < len; ++i)
z[i * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(x[i * xEws], y[i * yEws]);
}
//////////////////////////////////////////////////////////////////////
// Compares loopSimple against loopSpan: each is run on x/y/z arrays picked with rand()
// from a pool of identical copies, and the average time per iteration is printed.
TEST_F(PlaygroundTests, loopThroughArrs_test2) {
    NDArray x('c', {40, 25}, nd4j::DataType::FLOAT32);

    const int iterations = 1;
    const int arrays = 10;

    // Pool of arrays, every entry assigned from x.
    std::vector<NDArray> pool(arrays);
    for(auto& entry : pool)
        entry = x;

    //***********************************
    auto simpleBegin = std::chrono::system_clock::now();
    srand(119);
    for(Nd4jLong iter = 0; iter < iterations; ++iter) {
        // The three rand() draws happen in x, y, z order.
        NDArray& xArr = pool[rand() % arrays];
        NDArray& yArr = pool[rand() % arrays];
        NDArray& zArr = pool[rand() % arrays];

        auto xData = xArr.bufferAsT<float>();
        auto yData = yArr.bufferAsT<float>();
        auto zData = zArr.bufferAsT<float>();

        auto xInfo = xArr.getShapeInfo();
        auto yInfo = yArr.getShapeInfo();
        auto zInfo = zArr.getShapeInfo();

        loopSimple(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    auto simpleEnd = std::chrono::system_clock::now();
    auto simpleTime = std::chrono::duration_cast<std::chrono::microseconds> ((simpleEnd - simpleBegin)/iterations).count();

    //***********************************
    auto spanBegin = std::chrono::system_clock::now();
    for(Nd4jLong iter = 0; iter < iterations; ++iter) {
        NDArray& xArr = pool[rand() % arrays];
        NDArray& yArr = pool[rand() % arrays];
        NDArray& zArr = pool[rand() % arrays];

        auto xData = xArr.bufferAsT<float>();
        auto yData = yArr.bufferAsT<float>();
        auto zData = zArr.bufferAsT<float>();

        auto xInfo = xArr.getShapeInfo();
        auto yInfo = yArr.getShapeInfo();
        auto zInfo = zArr.getShapeInfo();

        loopSpan(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    auto spanEnd = std::chrono::system_clock::now();
    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((spanEnd - spanBegin)/iterations).count();

    nd4j_printf("simple time: %lld us;\n", simpleTime);
    nd4j_printf("span time: %lld us;\n", spanTime);

    ASSERT_TRUE(1);
}
// Applies LogPoissonLoss element-wise: z[i] = op(x[i], y[i]) for every linear index i,
// where each buffer's element offset is resolved from its own shape info.
// Work is split across threads via OmpLaunchHelper: every thread processes
// getItersPerThread(threadNum) consecutive indices starting at getThreadOffset(threadNum).
//
// x / xShapeInfo : first operand buffer and its shape info (also defines the length)
// y / yShapeInfo : second operand buffer and its shape info
// z / zShapeInfo : destination buffer and its shape info
static void loop1(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
    auto len = shape::length(xShapeInfo);

    nd4j::OmpLaunchHelper info(len);

    PRAGMA_OMP_PARALLEL_ARGS(num_threads(info._numThreads))
    {
        auto threadNum = omp_get_thread_num();
        const Nd4jLong threadOffset = info.getThreadOffset(threadNum);
        // Loop-invariant, so it is read once instead of on every iteration.
        const Nd4jLong threadIters = info.getItersPerThread(threadNum);

        PRAGMA_OMP_SIMD
        for (Nd4jLong j = 0; j < threadIters; j++) {
            const Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len);
            const Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len);
            const Nd4jLong zOffset = shape::getIndexOffset(j+threadOffset, zShapeInfo, len);

            // Each buffer is addressed with the offset derived from its own shape info,
            // so differing layouts of x, y and z are handled correctly.
            z[zOffset] = simdOps::LogPoissonLoss<float, float, float>::op(x[xOffset], y[yOffset]);
        }
    }
}
// Applies LogPoissonLoss element-wise: z[i] = op(x[i], y[i]) for every linear index i,
// where each buffer's element offset is resolved from its own shape info.
// The whole range is covered by one OpenMP parallel-for with a static schedule.
//
// x / xShapeInfo : first operand buffer and its shape info (also defines the length)
// y / yShapeInfo : second operand buffer and its shape info
// z / zShapeInfo : destination buffer and its shape info
static void loop2(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
    auto len = shape::length(xShapeInfo);

    PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static))
    for(Nd4jLong i = 0; i < len; ++i) {
        const Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len);
        const Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len);
        const Nd4jLong zOffset = shape::getIndexOffset(i, zShapeInfo, len);

        // Each buffer is addressed with the offset derived from its own shape info,
        // so differing layouts of x, y and z are handled correctly.
        z[zOffset] = simdOps::LogPoissonLoss<float, float, float>::op(x[xOffset], y[yOffset]);
    }
}
//////////////////////////////////////////////////////////////////////
// Compares loop2 against loop1: each is run on x/y/z arrays picked with rand() from a
// pool of identical copies, and the average time per iteration is printed.
TEST_F(PlaygroundTests, loopThroughArrs_test3) {
    NDArray x('c', {50, 250}, nd4j::DataType::FLOAT32);

    const int iterations = 1;
    const int arrays = 100;

    // Pool of arrays, every entry assigned from x.
    std::vector<NDArray> pool(arrays);
    for(auto& entry : pool)
        entry = x;

    //***********************************
    auto simpleBegin = std::chrono::system_clock::now();
    srand(119);
    for(Nd4jLong iter = 0; iter < iterations; ++iter) {
        // The three rand() draws happen in x, y, z order.
        NDArray& xArr = pool[rand() % arrays];
        NDArray& yArr = pool[rand() % arrays];
        NDArray& zArr = pool[rand() % arrays];

        auto xData = xArr.bufferAsT<float>();
        auto yData = yArr.bufferAsT<float>();
        auto zData = zArr.bufferAsT<float>();

        auto xInfo = xArr.getShapeInfo();
        auto yInfo = yArr.getShapeInfo();
        auto zInfo = zArr.getShapeInfo();

        loop2(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    auto simpleEnd = std::chrono::system_clock::now();
    auto simpleTime = std::chrono::duration_cast<std::chrono::microseconds> ((simpleEnd - simpleBegin)/iterations).count();

    //***********************************
    auto spanBegin = std::chrono::system_clock::now();
    for(Nd4jLong iter = 0; iter < iterations; ++iter) {
        NDArray& xArr = pool[rand() % arrays];
        NDArray& yArr = pool[rand() % arrays];
        NDArray& zArr = pool[rand() % arrays];

        auto xData = xArr.bufferAsT<float>();
        auto yData = yArr.bufferAsT<float>();
        auto zData = zArr.bufferAsT<float>();

        auto xInfo = xArr.getShapeInfo();
        auto yInfo = yArr.getShapeInfo();
        auto zInfo = zArr.getShapeInfo();

        loop1(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    auto spanEnd = std::chrono::system_clock::now();
    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((spanEnd - spanBegin)/iterations).count();

    nd4j_printf("simpleTime time: %lld us;\n", simpleTime);
    nd4j_printf("spanTime time: %lld us;\n", spanTime);

    ASSERT_TRUE(1);
}
// Runs the skipgram op on a synthetic batch and prints the average / total time.
// Inputs are filled deterministically from a fixed starting value; only the returned
// status is asserted.
TEST_F(PlaygroundTests, test_batched_skipgram_1) {
    const int batchSize = 64;
    const int codeLen = 6;
    const int numWords = 244;
    const int vectorLength = 50;

    auto target = NDArrayFactory::create<int>('c', {batchSize});
    auto ngStarter = NDArrayFactory::empty<int>();
    auto indices = NDArrayFactory::create<int>('c', {batchSize, codeLen});
    auto codes = NDArrayFactory::create<int8_t>('c', {batchSize, codeLen});
    auto syn0 = NDArrayFactory::create<float>('c', {numWords, vectorLength});
    auto syn1 = NDArrayFactory::create<float>('c', {numWords, vectorLength});
    auto syn1Neg = NDArrayFactory::empty<float>();
    auto expTable = NDArrayFactory::linspace<float>(0.001, 0.995, 10000);
    auto negTable = NDArrayFactory::empty<float>();
    auto alpha = NDArrayFactory::create<double>('c', {batchSize});
    auto randomValue = NDArrayFactory::create<Nd4jLong>('c', {batchSize});
    auto inferenceVector = NDArrayFactory::empty<float>();

    syn0.assign(0.01);
    syn1.assign(0.02);

    // Steps the running value that drives the generated word indices.
    const auto advance = [](Nd4jLong state) {
        return nd4j::math::nd4j_abs<Nd4jLong>(state * 25214903917L + 11);
    };

    Nd4jLong state = 2843242345121L;
    auto learningRate = 0.025;
    for (int row = 0; row < batchSize; row++) {
        target.p(row, row);
        alpha.p(row, learningRate);
        randomValue.p(row, state);
        learningRate -= 0.001;

        // Column 0 of indices/codes is intentionally left as created; filling starts at 1.
        for (int col = 1; col < codeLen; col++) {
            indices.p(row, col, nd4j::math::nd4j_abs<Nd4jLong>(state % numWords));
            codes.p(row, col, col % 2);
            state = advance(state);
        }
        state = advance(state);
    }
    //indices.printIndexedBuffer("indices");
    //codes.printIndexedBuffer("codes");

    auto iterations = 1;
    nd4j::ops::skipgram op;

    const auto begin = std::chrono::system_clock::now();
    for (int run = 0; run < iterations; run++) {
        auto result = op.execute({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, expTable, &negTable, &alpha, &randomValue, &inferenceVector}, {}, {}, {false}, true);
        ASSERT_EQ(Status::OK(), result->status());
        delete result;
    }
    const auto end = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((end - begin)/iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((end - begin)).count();

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);

    // expTable was returned as a raw pointer by linspace and is released here.
    delete expTable;
}
// Times a full-array reduce::Mean into a scalar via NativeOpExecutioner and prints the
// average and total duration. No values are asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_float_1) {
    auto array = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
    auto target = NDArrayFactory::create<float>(0.0f);

    // One reduction of `array` into `target`.
    const auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceFloatScalar(LaunchContext::defaultContext(), reduce::Mean, array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto begin = std::chrono::system_clock::now();
    for (int run = 0; run < iterations; run++) {
        reduceOnce();
    }
    const auto end = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((end - begin)/iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((end - begin)).count();

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
// Times reduce::ReduceFloatBenchmarkOp over a 100000-element vector into a scalar and
// prints the average (ns) and total (ms) duration. No values are asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_float_2) {
    auto array = NDArrayFactory::create<float>('c', {100000});
    auto target = NDArrayFactory::create<float>(0.0f);

    // One reduction of `array` into `target`.
    const auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceFloatScalar(LaunchContext::defaultContext(), reduce::ReduceFloatBenchmarkOp, array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto begin = std::chrono::system_clock::now();
    for (int run = 0; run < iterations; run++) {
        reduceOnce();
    }
    const auto end = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::nanoseconds> ((end - begin)/iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((end - begin)).count();

    nd4j_printf("average time: %lld ns;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
// Times reduce::ReduceSameBenchmarkOp over a 100000-element vector into a scalar and
// prints the average (ns) and total (ms) duration. No values are asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_same_2) {
    auto array = NDArrayFactory::create<float>('c', {100000});
    auto target = NDArrayFactory::create<float>(0.0f);

    // One reduction of `array` into `target`.
    const auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceSameScalar(LaunchContext::defaultContext(), reduce::ReduceSameBenchmarkOp, array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto begin = std::chrono::system_clock::now();
    for (int run = 0; run < iterations; run++) {
        reduceOnce();
    }
    const auto end = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::nanoseconds> ((end - begin)/iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((end - begin)).count();

    nd4j_printf("average time: %lld ns;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
// Times transform::Assign from one float array into another of the same shape and
// prints the average time, total time and a bandwidth figure derived from the
// array's byte size. No values are asserted.
TEST_F(PlaygroundTests, test_assign_float) {
    // auto array = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
    // auto target = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
    auto array = NDArrayFactory::create<float>('c', {32, 64, 128, 128});
    auto target = NDArrayFactory::create<float>('c', {32, 64, 128, 128});
    array.assign(119);

    // One Assign transform from `array` into `target`.
    const auto assignOnce = [&]() {
        NativeOpExecutioner::execTransformAny(LaunchContext::defaultContext(), transform::Assign, array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(), target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), nullptr, nullptr, nullptr);
    };

    // warm up
    for (int warm = 0; warm < 5; warm++) {
        assignOnce();
    }

    int iterations = 1;
    const auto begin = std::chrono::system_clock::now();
    for (int run = 0; run < iterations; run++) {
        assignOnce();
    }
    const auto end = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((end - begin)/iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((end - begin)).count();

    // bytes per microsecond scaled to bytes per second, then divided down to GB.
    auto bw = (1000000L * (float) (array.lengthOf() * array.sizeOfT()) / spanTime) / 1024 / 1024 / 1024;

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
    nd4j_printf("Bandwidth: %f GB/s\n", bw);
}
// Sums 31 * e over e = 1..99999 with an OpenMP reduction and prints the result.
// The exact sum does not fit into a 32-bit int, so the accumulation is done in
// unsigned arithmetic, where wraparound is well defined (signed overflow is not).
TEST_F(PlaygroundTests, test_hash_1) {
    std::vector<int> vec;
    vec.reserve(99999);
    for (int e = 1; e < 100000; e++)
        vec.emplace_back(e);

    const int size = static_cast<int>(vec.size());

    unsigned int r = 0;
    PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:r)
    for (int e = 0; e < size; e++) {
        r += 31u * static_cast<unsigned int>(vec[e]);
    }

    // Converted back to int so the value is printed with the original %i conversion.
    nd4j_printf("Result: %i\n", static_cast<int>(r));
}
// Creates a {5, 10000} int array and fills it via linspace starting at 1.
// The hash computation itself is currently commented out, so nothing is checked.
TEST_F(PlaygroundTests, test_hash_2) {
    auto values = NDArrayFactory::create<int>('c', {5, 10000});
    values.linspace(1.f);

    //auto h = values.reduceNumber(reduce::LongOps::HashCode);
    //h.printIndexedBuffer("hash");
}
/*
TEST_F(PlaygroundTests, test_manual_loop) {
const unsigned int len = 32 * 128 * 256 * 256;
auto array = new float[len];
auto z = new float[len];
for (unsigned int e = 0; e < len; e++)
array[e] = (float) e;
const int iterations = 100;
auto timeStart = std::chrono::system_clock::now();
for (int i = 0; i < iterations; i++) {
PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(4) schedule(static, 32768))
for (unsigned int e = 0; e < len; e++)
z[e] = array[e];
}
auto timeEnd = std::chrono::system_clock::now();
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
auto bw = (1000000L * (float) (len * sizeof(float)) / spanTime) / 1024 / 1024 / 1024;
nd4j_printf("length: %i\n", len);
nd4j_printf("average time: %lld us;\n", spanTime);
nd4j_printf("total time: %lld ms;\n", ttlTime);
nd4j_printf("Bandwidth: %f GB/s\n", bw)
delete[] array;
delete[] z;
}
TEST_F(PlaygroundTests, test_col2im_permuted_1) {
auto x = NDArrayFactory::create<float>('c', {8, 64, 55, 55, 3, 3});
x.assign(1.f);
x.permutei({0, 1, 4, 5, 2, 3});
auto z0 = NDArrayFactory::create<float>('c', {64, 8, 112, 112});
z0.permutei({1, 0, 2, 3});
auto z1 = NDArrayFactory::create<float>('c', {64, 8, 112, 112});
z1.permutei({1, 0, 2, 3});
nd4j_printf("Starting custom run...\n","");
const int iterations = 100;
nd4j::ops::col2im op;
auto timeStart = std::chrono::system_clock::now();
for (int e = 0; e < iterations; e++) {
op.execute({&x}, {&z0}, {}, {2, 2, 0, 0, 112, 112, 1, 1, 1}, {});
}
auto timeEnd = std::chrono::system_clock::now();
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
nd4j_printf("Starting legacy run...\n","");
ExtraArguments arguments({2., 2., 0., 0., 112., 112., 1., 1.});
auto legacyStart = std::chrono::system_clock::now();
for (int e = 0; e < iterations; e++)
x.applyTransform(transform::Col2Im, &z1, &arguments);
auto legacyEnd = std::chrono::system_clock::now();
auto legacySpanTime = std::chrono::duration_cast<std::chrono::microseconds> ((legacyEnd - legacyStart) / iterations).count();
auto legacyTtlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((legacyEnd - legacyStart)).count();
nd4j_printf("average time: %lld us vs %lld us;\n", spanTime, legacySpanTime);
nd4j_printf("total time: %lld ms vs %lld ms;\n", ttlTime, legacyTtlTime);
ASSERT_EQ(z0, z1);
}
TEST_F(PlaygroundTests, test_addi_assign) {
int iterations = 1;
auto x = NDArrayFactory::create<float>('c', {1000000000});
auto z = NDArrayFactory::create<float>('c', {1000000000});
x.assign(119.0f);
auto timeStart = std::chrono::system_clock::now();
x.applyScalar(scalar::Add,1.0f, &z, nullptr);
//z.assign(x);
auto timeEnd = std::chrono::system_clock::now();
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
auto bw = (1000000L * (float) (x.lengthOf() * x.sizeOfT()) / spanTime) / 1024 / 1024 / 1024;
nd4j_printf("Avg add(1.0f) time: %lld us\n", spanTime);
nd4j_printf("Bandwidth: %f GB/s\n", bw);
}
/////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, conv2d_1) {
const int N = 100;
int bS=8, iH=64,iW=64, iC=32,oC=32, kH=2,kW=2, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
int paddingMode = 1; // 1-SAME, 0-VALID;
int dataFormat = 0; // 1-NHWC, 0-NCHW
NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
NDArray output(input);
NDArray weights('c', {kH, kW, iC, oC}, nd4j::DataType::FLOAT32);
NDArray bias('c', {oC}, nd4j::DataType::FLOAT32);
input = 2.;
weights.linspace(0.1, 0.1);
bias = 0.5;
nd4j::ops::conv2d op;
for (int i = 0; i < 10; i++)
100.5*0.5;
auto timeStart = std::chrono::system_clock::now();
for (int i = 0; i < N; i++)
op.execute({&input, &weights, &bias}, {&output} , {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat},{});
auto timeEnd = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();
printf("duration %ld\n", duration);
}
*/
/////////////////////////////////////////////////////////////////////
// Times a single execution of the batchnorm_new op on an {8, 32, 64, 64} float input
// with per-channel (length 32) mean / variance / gamma / beta, and prints the
// duration in microseconds. The op's result is not checked.
TEST_F(PlaygroundTests, batchnorm_1) {
    const int N = 1;

    NDArray input   ('c', {8, 32, 64, 64}, nd4j::DataType::FLOAT32);
    NDArray output  ('c', {8, 32, 64, 64}, nd4j::DataType::FLOAT32);
    NDArray mean    ('c', {32}, nd4j::DataType::FLOAT32);
    NDArray variance('c', {32}, nd4j::DataType::FLOAT32);
    NDArray gamma   ('c', {32}, nd4j::DataType::FLOAT32);
    NDArray beta    ('c', {32}, nd4j::DataType::FLOAT32);

    input = 10.5;
    mean = 5.5;
    variance = 1.5;
    gamma = 0.5;
    beta = 2.5;

    nd4j::ops::batchnorm_new op;

    auto timeStart = std::chrono::system_clock::now();
    // for (int i = 0; i <N ; i++)
    // NOTE(review): {1e-5} is presumably epsilon and {1,1,1} the op's integer flags - confirm against the op.
    op.execute({&input, &mean, &variance, &gamma, &beta}, {&output}, {1e-5}, {1,1,1}, {});
    auto timeEnd = std::chrono::system_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();

    // The count's integer type is implementation-defined; cast explicitly so the
    // argument always matches the conversion specifier.
    printf("duration %lld\n", static_cast<long long>(duration));
}
//////////////////////////////////////////////////////////////////////
// Times the softmax op on a {1024, 256} float input filled via linspace and prints
// the average duration in microseconds. The op's result is not checked.
TEST_F(PlaygroundTests, softmax_1) {
    const int N = 1;

    NDArray input('c', {1024, 256}, nd4j::DataType::FLOAT32);
    NDArray output('c', {1024, 256}, nd4j::DataType::FLOAT32);
    input.linspace(-100., 0.01);

    nd4j::ops::softmax op;

    auto timeStart = std::chrono::system_clock::now();
    for (int i = 0; i < N ; i++)
        op.execute({&input}, {&output}, {}, {1}, {});
    auto timeEnd = std::chrono::system_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();

    // The count's integer type is implementation-defined; cast explicitly so the
    // argument always matches the conversion specifier.
    printf("duration %lld\n", static_cast<long long>(duration));
}
//////////////////////////////////////////////////////////////////////
// Takes the same index range of a {10, 5} array twice - once with the default second
// argument and once with `true` - and prints the shape info of both results.
TEST_F(PlaygroundTests, subarr_1) {
    NDArray source('c', {10, 5}, nd4j::DataType::FLOAT32);

    NDArray plainSub = source({0,0, 3,4});
    NDArray flaggedSub = source({0,0, 3,4}, true);

    plainSub.printShapeInfo("subArr1");
    flaggedSub.printShapeInfo("subArr2");
}
//////////////////////////////////////////////////////////////////////
// Extracts index 2 along the second dimension of a {10, 5} c-order array (all of the
// first dimension) and checks that the resulting sub-array reports an ews of 5.
TEST_F(PlaygroundTests, subarr_2) {
    NDArray source('c', {10, 5}, nd4j::DataType::FLOAT32);

    // subarray() hands back a raw pointer that is released at the end of the test.
    auto column = source.subarray({NDIndex::all(), NDIndex::point(2)});
    column->printShapeInfo("subArr1");

    ASSERT_EQ(5, column->ews());

    delete column;
}
////////////////////////////////////////////////////////////////////////////////
// Currently an empty test: the entire body below is commented out, so the test
// passes trivially. The disabled code timed reduceAlongDimension (Mean and Sum)
// against Loops::loopReduce on a {16, 32, 64, 64} array and compared the Mean results.
TEST_F(PlaygroundTests, loops_1) {
/*
const int N = 1;
NDArray x('c', {16, 32, 64, 64}, nd4j::DataType::FLOAT32);
NDArray z1('c', {32}, nd4j::DataType::FLOAT32);
NDArray z2('c', {32}, nd4j::DataType::FLOAT32);
NDArray z3('c', {32}, nd4j::DataType::FLOAT32);
std::vector<int> dimsToExclude = {0,2,3};
std::vector<int> tadDims = {1};
x.linspace(0.01);
// warm up
for (int i = 0; i < 1000; ++i)
32*512;
auto timeStart1 = std::chrono::system_clock::now();
for (int i = 0; i < N ; i++)
x.reduceAlongDimension(nd4j::reduce::Mean, &z1, dimsToExclude);
auto timeEnd1 = std::chrono::system_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd1 - timeStart1) / N).count();
auto timeStartE = std::chrono::system_clock::now();
for (int i = 0; i < N ; i++)
x.reduceAlongDimension(nd4j::reduce::Sum, &z3, dimsToExclude);
auto timeEndE = std::chrono::system_clock::now();
auto durationE = std::chrono::duration_cast<std::chrono::microseconds> ((timeEndE - timeStartE) / N).count();
Nd4jLong *tadShapeInfo(nullptr), *tadOffsets(nullptr);
x.getSubArrShapeAndOffsets(tadDims, tadShapeInfo, tadOffsets);
// shape::printShapeInfoLinear(tadShapeInfo);
// shape::printIntArray(tadOffsets, 32);
auto timeStart2 = std::chrono::system_clock::now();
for (int i = 0; i < N ; i++)
Loops::loopReduce<float, float, float>(x.bufferAsT<float>(), tadShapeInfo, tadOffsets,
z2.bufferAsT<float>(), z2.getShapeInfo(),
nullptr,
&simdOps::Mean<float,float>::startingValue,
&simdOps::Mean<float,float>::update,
&simdOps::Mean<float,float>::op,
&simdOps::Mean<float,float>::postProcess);
auto timeEnd2 = std::chrono::system_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd2 - timeStart2) / N).count();
RELEASE(tadShapeInfo, x.getWorkspace());
RELEASE(tadOffsets, x.getWorkspace());
// z1.printIndexedBuffer("z1 ");
// z2.printIndexedBuffer("z2 ");
ASSERT_TRUE(z1.equalsTo(z2));
printf("duration old: %ld\n", duration1);
printf("duration new: %ld\n", duration2);
printf("duration E: %ld\n", durationE);
*/
}
////////////////////////////////////////////////////////////////////////////////
// Times ConstantTadHelper::tadForDimensions for dimensions {0,2} of a hand-written
// rank-4 shape info (1024 x 1024 x 1024 x 1024) and prints the average duration in
// microseconds. The returned pack is not inspected.
TEST_F(PlaygroundTests, newTads_1) {
    const int N = 1;

    // Layout: rank, 4 dimensions, 4 strides, followed by three trailing descriptor fields.
    Nd4jLong shapeInfo[] = {4, 1024,1024,1024,1024, 1024*1024*1024,1024*1024,1024,1, 16384,1,99};
    const std::vector<int> tadDims = {0,2};

    auto timeStart = std::chrono::system_clock::now();
    for (int i = 0; i < N ; i++)
        auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(shapeInfo, tadDims);
    auto timeEnd = std::chrono::system_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();

    // The count's integer type is implementation-defined; cast explicitly so the
    // argument always matches the conversion specifier.
    printf("duration old: %lld\n", static_cast<long long>(duration));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, loops_2) {
    // Micro-benchmark comparing two ways of walking a 10x10x10 buffer through its
    // shape/strides while doing buff[z] = buff[x] * buff[y]:
    //   "My time"  - offsets are precomputed with shape::calcOffsets, then used in a flat loop;
    //   "Old time" - the offset is recomputed per element with shape::indexOffset.
    // Both results are mean microseconds per iteration. The test makes no assertions.
    const uint N = 5;

    const Nd4jLong dim0(10), dim1(10), dim2(10);
    const Nd4jLong shapeInfo[2*3+4] = {3, dim0,dim1,dim2, 1,dim0,dim0*dim1, 8192,1,102};
    const Nd4jLong len = shape::length(shapeInfo);

    // value-initialised storage: the loops below read every element before writing it,
    // so the buffer must not be left indeterminate
    std::vector<float> data(len);
    float* buff = data.data();

    const Nd4jLong* shape   = shape::shapeOf(const_cast<Nd4jLong*>(shapeInfo));
    const Nd4jLong* strides = shape::stride(const_cast<Nd4jLong*>(shapeInfo));

    // OmpLaunchHelper threadsInfo(len);

    // one offset table per operand; the vectors own the memory, the raw pointers are
    // what gets handed to shape::calcOffsets and indexed inside the OpenMP regions
    std::vector<Nd4jLong> xOffsetsHolder(len), yOffsetsHolder(len), zOffsetsHolder(len);
    Nd4jLong* xOffsets = xOffsetsHolder.data();
    Nd4jLong* yOffsets = yOffsetsHolder.data();
    Nd4jLong* zOffsets = zOffsetsHolder.data();

    // placeholder warm-up: the loop body is a constant expression without side effects
    for (int i = 0; i < 1000; ++i) 32*512;

    //***********************************
    // variant 1: precomputed offset tables
    //***********************************
    auto timeStart = std::chrono::system_clock::now();

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_SECTIONS
        {
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, xOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, yOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, zOffsets);
            }
        }

        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++)
            buff[zOffsets[i]] = buff[xOffsets[i]] * buff[yOffsets[i]];
    }

    auto timeEnd = std::chrono::system_clock::now();
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    //***********************************
    // variant 2: per-element shape::indexOffset (the shape-info casts are inside the timed region)
    //***********************************
    timeStart = std::chrono::system_clock::now();

    uint xShapeInfoCast[MAX_RANK];
    uint yShapeInfoCast[MAX_RANK];
    uint zShapeInfoCast[MAX_RANK];

    bool canCastX = DataTypeUtils::castShapeInfo(shapeInfo, xShapeInfoCast);
    bool canCastY = DataTypeUtils::castShapeInfo(shapeInfo, yShapeInfoCast);
    bool canCastZ = DataTypeUtils::castShapeInfo(shapeInfo, zShapeInfoCast);

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++) {
            auto xOffset = shape::indexOffset(i, shapeInfo, xShapeInfoCast, len, canCastX);
            auto yOffset = shape::indexOffset(i, shapeInfo, yShapeInfoCast, len, canCastY);
            auto zOffset = shape::indexOffset(i, shapeInfo, zShapeInfoCast, len, canCastZ);
            buff[zOffset] = buff[xOffset] * buff[yOffset];
        }

        // PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
        // for (uint i0 = 0; i0 < shape[0]; ++i0)
        //     for (uint i1 = 0; i1 < shape[1]; ++i1)
        //         for (uint i2 = 0; i2 < shape[2]; ++i2)
        //             buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
    }

    timeEnd = std::chrono::system_clock::now();
    const auto oldTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // widen explicitly so the argument type always matches %lld
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
    nd4j_printf("Old time: %lld us;\n", static_cast<long long>(oldTime));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, loops_3) {
    // Micro-benchmark on a 10x10x10 buffer comparing:
    //   "My time"  - a hand-rolled walk that advances the x/y/z offsets incrementally
    //                from the shape and strides;
    //   "Old time" - offset tables precomputed with shape::calcOffsets and a flat loop.
    // Both results are mean microseconds per iteration. The test makes no assertions.
    const uint N = 5;

    // const Nd4jLong dim0(1024), dim1(1024), dim2(1024);
    const Nd4jLong dim0(10), dim1(10), dim2(10);
    const Nd4jLong shapeInfo[2*3+4] = {3, dim0,dim1,dim2, dim1*dim2,dim2,1, 8192,1,99};
    const Nd4jLong len = shape::length(shapeInfo);

    // value-initialised storage: both variants read elements before writing them,
    // so the buffer must not be left indeterminate
    std::vector<float> data(len);
    float* buff = data.data();

    const Nd4jLong* shape   = shape::shapeOf(const_cast<Nd4jLong*>(shapeInfo));
    const Nd4jLong* strides = shape::stride(const_cast<Nd4jLong*>(shapeInfo));

    // placeholder warm-up: the loop body is a constant expression without side effects
    for (int i = 0; i < 1000; ++i) 32*512;

    //***********************************
    // variant 1: incremental offset walk
    //***********************************
    auto timeStart = std::chrono::system_clock::now();

    for (uint iter = 0; iter < N; ++iter)
    {
        // per-operand index counters and "(shape[k] - 1) * strides[k]" rewind amounts;
        // they are allocated inside the timed region, so that cost is part of "My time"
        Nd4jLong* idxX = new Nd4jLong[3];
        Nd4jLong* idxY = new Nd4jLong[3];
        Nd4jLong* idxZ = new Nd4jLong[3];
        Nd4jLong* offsetPerDimX = new Nd4jLong[3];
        Nd4jLong* offsetPerDimY = new Nd4jLong[3];
        Nd4jLong* offsetPerDimZ = new Nd4jLong[3];
        memset(idxX, 0, sizeof(Nd4jLong) * 3);
        memset(idxY, 0, sizeof(Nd4jLong) * 3);
        memset(idxZ, 0, sizeof(Nd4jLong) * 3);

        PRAGMA_OMP_SIMD
        for (int k = 0; k < 3; ++k) {
            offsetPerDimX[k] = (shape[k] - 1) * strides[k];
            offsetPerDimY[k] = (shape[k] - 1) * strides[k];
            offsetPerDimZ[k] = (shape[k] - 1) * strides[k];
        }

        Nd4jLong initX(0), initY(0), initZ(0), offsetsX(0), offsetsY(0), offsetsZ(0);
        Nd4jLong rankMinusOne(3 - 1), jX(rankMinusOne), jY(rankMinusOne), jZ(rankMinusOne);

        // we do first iteration separately
        buff[offsetsZ] = buff[offsetsX] * buff[offsetsY];
        uint e = 1;

        // `e` is advanced only by the X branches; the Y and Z branches mirror them on
        // their own counters. j* selects the dimension currently being advanced.
        while (e < len) {
            // printf("%lld, %lld, %lld\n", jX, jY, jZ);

            // a dimension of extent 1 contributes nothing: move to the next outer one
            if(shape[jX] == 1) { --jX; --jY; --jZ; continue; }

            if(jX == rankMinusOne) { for(int l = 1; l < shape[jX]; ++l) {offsetsX += strides[jX]; ++e;} --jX; }
            else if(idxX[jX] < shape[jX] - 1) {initX += strides[jX]; offsetsX = initX; ++idxX[jX]; jX = rankMinusOne; ++e;}
            else {initX -= offsetPerDimX[jX]; idxX[jX--] = 0;}

            if(jY == rankMinusOne) { for(int l = 1; l < shape[jY]; ++l) {offsetsY += strides[jY];} --jY; }
            else if(idxY[jY] < shape[jY] - 1) {initY += strides[jY]; offsetsY = initY; ++idxY[jY]; jY = rankMinusOne; }
            else {initY -= offsetPerDimY[jY]; idxY[jY--] = 0;}

            if(jZ == rankMinusOne) { for(int l = 1; l < shape[jZ]; ++l) {offsetsZ += strides[jZ];} --jZ; }
            else if(idxZ[jZ] < shape[jZ] - 1) {initZ += strides[jZ]; offsetsZ = initZ; ++idxZ[jZ]; jZ = rankMinusOne; }
            else {initZ -= offsetPerDimZ[jZ]; idxZ[jZ--] = 0;}

            buff[offsetsZ] = buff[offsetsX] * buff[offsetsY];
        }

        delete []idxX;
        delete []idxY;
        delete []idxZ;
        delete []offsetPerDimX;
        delete []offsetPerDimY;
        delete []offsetPerDimZ;
    }

    auto timeEnd = std::chrono::system_clock::now();
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    //***********************************
    // variant 2: precomputed offset tables
    //***********************************
    timeStart = std::chrono::system_clock::now();

    // uint xShapeInfoCast[MAX_RANK];
    // uint yShapeInfoCast[MAX_RANK];
    // uint zShapeInfoCast[MAX_RANK];
    // bool canCastX = DataTypeUtils::castShapeInfo(shapeInfo, xShapeInfoCast);
    // bool canCastY = DataTypeUtils::castShapeInfo(shapeInfo, yShapeInfoCast);
    // bool canCastZ = DataTypeUtils::castShapeInfo(shapeInfo, zShapeInfoCast);
    // for (int i = 0; i < N; ++i)
    // {
    //     PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
    //     for (uint i0 = 0; i0 < shape[0]; ++i0)
    //         for (uint i1 = 0; i1 < shape[1]; ++i1)
    //             for (uint i2 = 0; i2 < shape[2]; ++i2)
    //                 buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
    // }

    // the offset tables are created inside the timed region; the vectors own the
    // memory and the raw pointers are what the OpenMP regions index
    std::vector<Nd4jLong> xOffsetsHolder(len), yOffsetsHolder(len), zOffsetsHolder(len);
    Nd4jLong* xOffsets = xOffsetsHolder.data();
    Nd4jLong* yOffsets = yOffsetsHolder.data();
    Nd4jLong* zOffsets = zOffsetsHolder.data();

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_SECTIONS
        {
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, xOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, yOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, zOffsets);
            }
        }

        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++)
            buff[zOffsets[i]] = buff[xOffsets[i]] * buff[yOffsets[i]];
    }

    timeEnd = std::chrono::system_clock::now();
    const auto oldTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // widen explicitly so the argument type always matches %lld
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
    nd4j_printf("Old time: %lld us;\n", static_cast<long long>(oldTime));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, loops_4) {
    // Micro-benchmark: times NDArray::reduceAlongDimension(reduce::Sum) over dimensions
    // {1,3} of a 10x10x10x10 array into a 10x10 result and prints the mean time per call.
    // The test makes no assertions.
    const uint N = 2;

    // const Nd4jLong dim0(256), dim1(256), dim2(256), dim3(256);
    const Nd4jLong dim0(10), dim1(10), dim2(10), dim3(10);

    NDArray x('c', {dim0, dim1, dim2, dim3});
    NDArray z('c', {dim0, dim2});

    x = 0.1;

    // placeholder warm-up: the loop body is a constant expression without side effects
    for (int i = 0; i < 1000; ++i) 32*512;

    const auto timeStart = std::chrono::system_clock::now();
    for (uint i = 0; i < N; ++i)
        x.reduceAlongDimension(reduce::Sum, &z, {1,3});
    const auto timeEnd = std::chrono::system_clock::now();

    // measured in microseconds so that the value agrees with the "us" label below
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // widen explicitly so the argument type always matches %lld
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, loops_5) {
    // Micro-benchmark: times NDArray::applyTransform(transform::Log) from one 10x10x10
    // array into another after zeroing the `ews` field of both shape infos, and prints
    // the mean time per call. The test makes no assertions.
    const uint N = 2;

    // const Nd4jLong dim0(1024), dim1(1024), dim2(256);
    const Nd4jLong dim0(10), dim1(10), dim2(10);

    NDArray x('c', {dim0, dim1, dim2});
    NDArray z('c', {dim0, dim1, dim2});

    // provide worst case
    // NOTE(review): this writes through the pointer returned by shapeInfo(); if that
    // buffer is shared with other arrays the change would be visible to them too — confirm.
    *shape::ews(x.shapeInfo()) = 0;
    *shape::ews(z.shapeInfo()) = 0;

    x = 0.1;

    // placeholder warm-up: the loop body is a constant expression without side effects
    for (int i = 0; i < 1000; ++i) 32*512;

    const auto timeStart = std::chrono::system_clock::now();
    for (uint i = 0; i < N; ++i)
        x.applyTransform(transform::Log, &z);
    const auto timeEnd = std::chrono::system_clock::now();

    // measured in microseconds so that the value agrees with the "us" label below
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // widen explicitly so the argument type always matches %lld
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, im2col_1) {
    // Micro-benchmark: times nd4j::ops::helpers::im2col on a small FLOAT32 image and
    // prints the mean duration per call in microseconds. The test makes no assertions.

    // int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int bS=2, iH=4,iW=4, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1; // VALID
    int oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1; // VALID

    NDArray image('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
    NDArray column('c', {bS, iC, kH, kW, oH, oW}, nd4j::DataType::FLOAT32);

    nd4j::LaunchContext * context = image.getContext();
    NDArray padValue (nd4j::DataType::FLOAT32, context); // scalar =0

    image.linspace(1, 1);

    const int N = 1;

    // warm up: one untimed call with the same arguments as the timed loop
    nd4j::ops::helpers::im2col(*context, image, column, kH, kW, sH, sW, pH, pW, dH, dW, padValue);

    // ---------------------------------------- //
    const auto timeStart1 = std::chrono::system_clock::now();
    for (int i = 0; i < N; i++) {
        nd4j::ops::helpers::im2col(*context, image, column, kH, kW, sH, sW, pH, pW, dH, dW, padValue);
        // FIXME: do not use cuda methods in generic code
        //cudaStreamSynchronize(*context->getCudaStream());
    }
    const auto timeEnd1 = std::chrono::system_clock::now();

    const auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd1 - timeStart1) / N).count();

    // the chrono tick-count type is not `long` on every platform, so widen explicitly
    // and print with a matching conversion specifier
    printf("duration my %lld\n", static_cast<long long>(duration1));
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, im2col_2) {
    // Micro-benchmark: times the transform::Im2col op invoked through
    // NativeOpExecutioner::execTransformSame on a small FLOAT32 image and prints the
    // mean duration per call in microseconds. The test makes no assertions.

    // int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int bS=2, iH=4,iW=4, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1; // VALID
    int oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1; // VALID

    NDArray image('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
    NDArray column('c', {bS, iC, kH, kW, oH, oW}, nd4j::DataType::FLOAT32);

    nd4j::LaunchContext * context = image.getContext();

    image.linspace(1, 1);

    // kernel, stride, padding and dilation values followed by two trailing zeros,
    // all passed to the op as doubles
    ExtraArguments extras(std::vector<double>({(double)kH, (double)kW, (double)sH, (double)sW, (double)pH, (double)pW, (double)dH, (double)dW, 0., 0.}));

    const int N = 1;

    // extra arguments converted to the data type of `column`
    void* params = extras.argumentsAsT(column.dataType());

    // warm up: one untimed call with the same arguments as the timed loop
    NativeOpExecutioner::execTransformSame(context, nd4j::transform::Im2col,
                                           image.buffer(), image.getShapeInfo(), image.getSpecialBuffer(), image.getSpecialShapeInfo(),
                                           column.buffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(),
                                           params,
                                           nullptr, nullptr);

    // ---------------------------------------- //
    const auto timeStart2 = std::chrono::system_clock::now();
    for (int i = 0; i < N; i++) {
        NativeOpExecutioner::execTransformSame(context, nd4j::transform::Im2col,
                                               image.buffer(), image.getShapeInfo(), image.getSpecialBuffer(), image.getSpecialShapeInfo(),
                                               column.buffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(),
                                               params,
                                               nullptr, nullptr);
    }
    const auto timeEnd2 = std::chrono::system_clock::now();

    const auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd2 - timeStart2) / N).count();

    // the chrono tick-count type is not `long` on every platform, so widen explicitly
    // and print with a matching conversion specifier
    printf("duration old %lld\n", static_cast<long long>(duration2));
}
/*
TEST_F(PlaygroundTests, test_scatter_119) {
auto output = NDArrayFactory::create<float>('c', {65536, 512});
auto updates = NDArrayFactory::create<float>('c', {65536, 512});
auto indices = NDArrayFactory::create<int>('c', {65536});
int p = 0;
for (int e = 65534; e >= 0; e--)
indices.p(p++, e);
indices.syncToDevice();
int N = 1;
auto timeStart1 = std::chrono::system_clock::now();
for (int i = 0; i < N ; i++) {
helpers::scatter(LaunchContext::defaultContext(), pairwise::CopyPws, indices, updates, output, false);
// FIXME: do not use cuda methods in generic code
//cudaStreamSynchronize(*context->getCudaStream());
}
auto timeEnd1 = std::chrono::system_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd1 - timeStart1) / N).count();
nd4j_printf("duration my %ld\n", duration1);
}
TEST_F(PlaygroundTests, test_scatter_120) {
auto output = NDArrayFactory::create_<float>('c', {65536, 512});
auto updates = NDArrayFactory::create_<float>('c', {65536, 512});
auto indices = NDArrayFactory::create_<int>('c', {65536});
int p = 0;
for (int e = 65534; e >= 0; e--)
indices->p(p++, e);
indices->syncToDevice();
int N = 1;
auto timeStart1 = std::chrono::system_clock::now();
for (int i = 0; i < N ; i++) {
helpers::scatter(LaunchContext::defaultContext(), pairwise::CopyPws, *indices, *updates, *output, false);
// FIXME: do not use cuda methods in generic code
//cudaStreamSynchronize(*context->getCudaStream());
}
auto timeEnd1 = std::chrono::system_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd1 - timeStart1) / N).count();
nd4j_printf("duration my %ld\n", duration1);
delete output;
delete indices;
delete updates;
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, mmulMxM_1) {
const int numOfIters = 100;
const Nd4jLong M = 1024;
const Nd4jLong K = 1024;
const Nd4jLong N = 1024;
NDArray a('f', {M,K}, nd4j::DataType::FLOAT32);
NDArray b('f', {K,N}, nd4j::DataType::FLOAT32);
NDArray c('c', {M,N}, nd4j::DataType::FLOAT32);
auto timeStart = std::chrono::system_clock::now();
for (int i = 0; i < numOfIters; ++i)
nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.);
auto timeEnd = std::chrono::system_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / numOfIters).count();
printf("duration %ld\n", duration1);
}
*/