2354 lines
86 KiB
C++
2354 lines
86 KiB
C++
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
|
|
|
|
//
|
|
// Created by raver119 on 20.11.17.
|
|
//
|
|
|
|
#include "testlayers.h"
|
|
#include <Graph.h>
|
|
#include <chrono>
|
|
#include <Node.h>
|
|
#include <ops/declarable/CustomOperations.h>
|
|
#include <graph/profiling/GraphProfilingHelper.h>
|
|
#include <type_conversions.h>
|
|
#include <helpers/threshold.h>
|
|
#include <helpers/MmulHelper.h>
|
|
#include <ops/ops.h>
|
|
#include <OmpLaunchHelper.h>
|
|
#include <GradCheck.h>
|
|
#include <ops/declarable/helpers/im2col.h>
|
|
#include <Loops.h>
|
|
|
|
#include <helpers/BenchmarkHelper.h>
|
|
#include <ops/declarable/helpers/scatter.h>
|
|
#include <helpers/ConstantShapeHelper.h>
|
|
#include <helpers/ConstantTadHelper.h>
|
|
#include <array>
|
|
|
|
using namespace nd4j;
|
|
using namespace nd4j::graph;
|
|
|
|
class PlaygroundTests : public testing::Test {
|
|
public:
|
|
int numIterations = 3;
|
|
int poolSize = 10;
|
|
|
|
PlaygroundTests() {
|
|
printf("\n");
|
|
fflush(stdout);
|
|
}
|
|
};
|
|
|
|
/*
|
|
TEST_F(PlaygroundTests, LSTMBenchmarks_DebugTNS) {
|
|
|
|
BenchmarkHelper helper(5,10);
|
|
|
|
PredefinedParameters mb("mb", {1, 8, 64});
|
|
PredefinedParameters nInOut("nInOut", {32, 256, 1024});
|
|
|
|
ParametersBatch batch({&mb, &nInOut});
|
|
nd4j::ops::lstmBlock lstmBlock;
|
|
DeclarableBenchmark benchmark(lstmBlock, "lstm");
|
|
|
|
int seqLength = 64;
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int m = p.getIntParam("mb");
|
|
int n = p.getIntParam("nInOut");
|
|
|
|
Nd4jLong l = 0;
|
|
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>(l)); //Max TS length (unused)
|
|
|
|
|
|
//TNS format
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //x
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //i
|
|
ctx->setOutputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //c
|
|
ctx->setOutputArray(2, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //f
|
|
ctx->setOutputArray(3, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //o
|
|
ctx->setOutputArray(4, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //z
|
|
ctx->setOutputArray(5, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //h
|
|
ctx->setOutputArray(6, NDArrayFactory::create_<float>('c', {seqLength, m, n})); //y
|
|
|
|
auto cLast = NDArrayFactory::create_<float>('c', {m, n});
|
|
auto yLast = NDArrayFactory::create_<float>('c', {m, n});
|
|
auto W = NDArrayFactory::create_<float>('c', {2 * n, 4 * n});
|
|
auto Wci = NDArrayFactory::create_<float>('c', {n});
|
|
auto Wcf = NDArrayFactory::create_<float>('c', {n});
|
|
auto Wco = NDArrayFactory::create_<float>('c', {n});
|
|
auto b = NDArrayFactory::create_<float>('c', {4 * n});
|
|
|
|
ctx->setInputArray(2, cLast);
|
|
ctx->setInputArray(3, yLast);
|
|
ctx->setInputArray(4, W);
|
|
ctx->setInputArray(5, Wci);
|
|
ctx->setInputArray(6, Wcf);
|
|
ctx->setInputArray(7, Wco);
|
|
ctx->setInputArray(8, b);
|
|
|
|
Nd4jLong *iargs = new Nd4jLong[2];
|
|
iargs[0] = 0; //No peephole
|
|
iargs[1] = 0; //TNS
|
|
ctx->setIArguments(iargs, 2);
|
|
delete[] iargs;
|
|
double *targs = new double[2];
|
|
targs[0] = 1.0; //forget bias
|
|
targs[1] = 0.0; //cell clipping value
|
|
ctx->setTArguments(targs, 2);
|
|
delete[] targs;
|
|
return ctx;
|
|
};
|
|
|
|
helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
|
|
}
|
|
|
|
|
|
TEST_F(PlaygroundTests, BroadcastOps2d) {
|
|
BenchmarkHelper helper;
|
|
|
|
PredefinedParameters rows("rows", {1024, 1048576});
|
|
IntPowerParameters cols("cols", 2, 2, 10, 2); //2^1 to 2^10 in steps of 2 - 2^1=2, ..., 2^10=1024
|
|
BoolParameters axis("axis");
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&rows, &cols, &axis, &inplace});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
nd4j_printf("Entered generator\n","");
|
|
auto a = p.getIntParam("axis");
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
|
|
nd4j_printf("Created first array: [%lld, %lld]\n",arr->sizeAt(0), arr->sizeAt(1));
|
|
|
|
auto ctx = new Context(1);
|
|
ctx->setInputArray(0, arr, true);
|
|
if(a == 0){
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), 1}), true);
|
|
nd4j_printf("Created second array (a=0): [%lld, %lld]\n",ctx->getNDArray(1)->sizeAt(0), ctx->getNDArray(1)->sizeAt(1));
|
|
} else {
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
|
|
nd4j_printf("Created second array (a=1): [%lld, %lld]\n",ctx->getNDArray(1)->sizeAt(0), ctx->getNDArray(1)->sizeAt(1));
|
|
}
|
|
if (p.getIntParam("inplace") == 1) {
|
|
ctx->setOutputArray(0, arr, false);
|
|
ctx->markInplace(true);
|
|
nd4j_printf("Set result array (inplace)\n","");
|
|
} else {
|
|
auto out = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
|
|
ctx->setOutputArray(0, out, true);
|
|
nd4j_printf("Created and set result array (not inplace): [%lld, %lld]\n",out->sizeAt(0), out->sizeAt(1));
|
|
}
|
|
return ctx;
|
|
};
|
|
|
|
std::string s("add");
|
|
nd4j::ops::add op;
|
|
DeclarableBenchmark benchmark(op, "add");
|
|
nd4j_printf("About to execute\n","");
|
|
helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
|
|
}
|
|
*/
|
|
|
|
// Times reduce::Mean over a 4x64 float array along dimension 0 (output length 64)
// through NativeOpExecutioner::execReduceFloat, then prints the median, mean,
// min and max of the measured durations in nanoseconds.
TEST_F(PlaygroundTests, test_small_reductions) {
    // steady_clock is monotonic; system_clock may jump if the wall clock is
    // adjusted, which would corrupt an elapsed-time measurement
    using Clock = std::chrono::steady_clock;

    const int iterations = 1;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        // operands are rebuilt on every pass, outside of the timed region
        auto x = NDArrayFactory::create<float>('c', {4, 64});
        auto z = NDArrayFactory::create<float>('c', {64});
        x.assign(1.0f);
        int axis = 0;

        // shape info / offsets for the reduced dimension, obtained before timing starts
        auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), axis);

        const auto timeStart = Clock::now();

        NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext ::defaultContext(), reduce::Mean, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), &axis, 1, tadPack.primaryShapeInfo(), tadPack.primaryOffsets());

        const auto timeEnd = Clock::now();
        const Nd4jLong duration = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
        results[e] = duration;
        mean += duration;

        if (duration > max)
            max = duration;

        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // sorted so that the middle element is the median
    std::sort(results.begin(), results.end());

    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
|
|
|
|
// Times transform::StrictOps::Sin applied from a permuted input into a plain
// 'c'-ordered output, and prints median/mean/min/max durations in nanoseconds.
TEST_F(PlaygroundTests, Test_PermutedArray_Operation_1) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    auto x = NDArrayFactory::create<float>('c',{64, 32, 4, 32});
    auto z = NDArrayFactory::create<float>('c', {4, 64, 32, 32});
    x.assign(1.0f);

    // in-place permutation; presumably yields the same 4x64x32x32 shape as z
    x.permutei({2, 0, 3, 1});

    //x.printShapeInfo("x");

    const int iterations = 1;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        const auto timeStart = Clock::now();

        NativeOpExecutioner::execTransformStrict(LaunchContext::defaultContext(), transform::StrictOps::Sin, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr);

        const auto timeEnd = Clock::now();
        const Nd4jLong duration = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
        results[e] = duration;
        mean += duration;

        if (duration > max)
            max = duration;

        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // sorted so that the middle element is the median
    std::sort(results.begin(), results.end());

    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
|
|
|
|
// Times transform::StrictOps::Tanh from an input described by a hand-written
// shape descriptor into a freshly allocated output, 100 times, and prints
// median/mean/min/max durations in nanoseconds.
TEST_F(PlaygroundTests, Test_PermutedArray_Operation_2) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    const int iterations = 100;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    for (int e = 0; e < iterations; e++) {
        // Raw shape descriptors. x differs from z in the 5th entry (1024 vs 256) and
        // the 7th entry (0 vs 1) - presumably a strided 8x256 view without an
        // element-wise stride; TODO confirm the meaning of the trailing fields.
        Nd4jLong xShapeInfo[] = {2, 8, 256, 1024, 1, 8192, 0, 99};
        Nd4jLong yShapeInfo[] = {2, 8, 256, 256, 1, 8192, 1, 99};
        // backing storage handed to x below
        float xBuff[8*1024];

        NDArray x(xBuff, xShapeInfo);
        // variant with the same descriptor as z: NDArray x(yShapeInfo, nd4j::DataType::FLOAT32, true);
        NDArray z(yShapeInfo, nd4j::DataType::FLOAT32, true);
        x.linspace(0.1f, 0.01f);

        const auto timeStart = Clock::now();

        NativeOpExecutioner::execTransformStrict(LaunchContext::defaultContext(), transform::StrictOps::Tanh, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr);

        const auto timeEnd = Clock::now();
        const Nd4jLong duration = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
        results[e] = duration;
        mean += duration;

        if (duration > max)
            max = duration;

        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // sorted so that the middle element is the median
    std::sort(results.begin(), results.end());

    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
|
|
|
|
// Times NativeOps::execReduce3 with reduce3::CosineDistance between a 16x32
// array and a length-32 vector along dimension 1 (output length 16), 100 times,
// and prints median/mean/min/max durations in microseconds.
TEST_F(PlaygroundTests, test_reduce_3) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    // larger configuration that was used previously:
    // auto x = NDArrayFactory::create<float>('c', {4096, 8192});
    // auto y = NDArrayFactory::create<float>('c', {8192});
    // auto z = NDArrayFactory::create<float>('c', {4096});

    auto x = NDArrayFactory::create<float>('c', {16, 32});
    auto y = NDArrayFactory::create<float>('c', {32});
    auto z = NDArrayFactory::create<float>('c', {16});

    // the reduction dimension is passed to the op as an int array
    auto dim = NDArrayFactory::create<int>('c', {1}, {1});
    const int iterations = 100;
    std::vector<Nd4jLong> results(iterations);
    Nd4jLong mean = 0L;
    Nd4jLong max = 0L;
    Nd4jLong min = DataTypeUtils::max<Nd4jLong>();

    NativeOps nativeOps;

    for (int e = 0; e < iterations; e++) {
        const auto timeStart = Clock::now();

        nativeOps.execReduce3(nullptr, reduce3::CosineDistance, x.buffer(), x.shapeInfo(), x.specialBuffer(),
                              x.specialShapeInfo(), nullptr, y.buffer(), y.shapeInfo(), y.specialBuffer(),
                              y.specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(),
                              dim.buffer(), dim.shapeInfo(), dim.specialBuffer(), dim.specialShapeInfo(), nullptr,
                              nullptr, nullptr, nullptr);

        const auto timeEnd = Clock::now();
        const Nd4jLong duration = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
        results[e] = duration;
        mean += duration;

        if (duration > max)
            max = duration;

        if (duration < min)
            min = duration;
    }

    mean /= iterations;
    // sorted so that the middle element is the median
    std::sort(results.begin(), results.end());

    nd4j_printf("Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld]\n", results[results.size() / 2], mean, min, max);
}
|
|
|
|
/*
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_1) {
|
|
|
|
BenchmarkHelper helper;
|
|
|
|
ScalarBenchmark sb1(scalar::Add, "add", NDArrayFactory::create_<float>('c', {100, 100}), NDArrayFactory::create_<float>(1.0f), NDArrayFactory::create_<float>('c', {100, 100}));
|
|
ScalarBenchmark sb2(scalar::Add, "add", NDArrayFactory::create_<float>('c', {1000, 1000}), NDArrayFactory::create_<float>(1.0f), NDArrayFactory::create_<float>('c', {1000, 1000}));
|
|
|
|
helper.runOperationSuit({&sb1, &sb2}, "ScalarAdd");
|
|
}
|
|
|
|
|
|
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_2) {
|
|
|
|
BenchmarkHelper helper;
|
|
Parameters parameters;
|
|
parameters.addBoolParam("fOrder", true);
|
|
float scalar = 2.0f;
|
|
|
|
auto fa = NDArrayFactory::create<int>(1);
|
|
|
|
ScalarBenchmark sb(scalar::Multiply);
|
|
|
|
// Y will be shared
|
|
sb.setY(NDArrayFactory::create_<float>(scalar));
|
|
|
|
auto generator = GENERATE_XZ() {
|
|
// operands go together line by line
|
|
x.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
|
|
|
|
x.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
|
|
|
|
// only share within single op call. do not cross share
|
|
auto shared = NDArrayFactory::create_<float>('c', {256, 768});
|
|
x.push_back(shared);
|
|
z.push_back(shared);
|
|
|
|
// using bool param here
|
|
if (parameters.getBoolParam("fOrder")) {
|
|
x.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {1000, 1000}));
|
|
}
|
|
|
|
//another way to call inplace op
|
|
x.push_back(NDArrayFactory::create_<float>('c', {100, 100}));
|
|
z.push_back(nullptr);
|
|
|
|
};
|
|
|
|
helper.runOperationSuit(&sb, generator, "ScalarTest");
|
|
|
|
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
|
|
|
|
|
|
// we can use the same generator, since the same number of operands used
|
|
helper.runOperationSuit(&tb, generator, "TransformTest");
|
|
|
|
PairwiseBenchmark pb(pairwise::Pow, "pow test");
|
|
|
|
auto generatorXYZ = GENERATE_XYZ() {
|
|
x.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
|
|
y.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
|
|
|
|
x.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
|
|
y.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
|
|
z.push_back(NDArrayFactory::create_<float>('f', {100, 1000}));
|
|
};
|
|
|
|
helper.runOperationSuit(&pb, generatorXYZ, "PairwiseTest");
|
|
|
|
auto generatorReductionAxis = GENERATE_XYZ() {
|
|
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
|
|
|
|
// axis goes to y here
|
|
y.push_back(NDArrayFactory::create_<int>(0));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {1000}));
|
|
|
|
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
|
|
y.push_back(NDArrayFactory::create_<int>(1));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {100}));
|
|
|
|
// scalar case
|
|
x.push_back(NDArrayFactory::create_<float>('c', {100, 1000}));
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
|
|
|
|
ReductionBenchmark rb(reduce::FloatOps::Mean);
|
|
|
|
helper.runOperationSuit(&rb, (const std::function<void (ResultSet &, ResultSet &, ResultSet &)>)(generatorReductionAxis), "ReductionAlongDimensionTest");
|
|
}
|
|
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_3) {
|
|
|
|
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
|
|
PredefinedParameters a("alpha", {2, 3, 4});
|
|
PredefinedParameters b("beta", {9, 15, 27});
|
|
|
|
ParametersBatch batch({&a, &b});
|
|
|
|
auto parameters = batch.parameters();
|
|
ASSERT_EQ(9, parameters.size());
|
|
|
|
auto params_0 = parameters[0];
|
|
ASSERT_EQ(2, params_0.getIntParam("alpha"));
|
|
ASSERT_EQ(9, params_0.getIntParam("beta"));
|
|
|
|
auto params_1 = parameters[1];
|
|
ASSERT_EQ(2, params_1.getIntParam("alpha"));
|
|
ASSERT_EQ(15, params_1.getIntParam("beta"));
|
|
|
|
auto params_3 = parameters[3];
|
|
ASSERT_EQ(3, params_3.getIntParam("alpha"));
|
|
ASSERT_EQ(9, params_3.getIntParam("beta"));
|
|
}
|
|
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_4) {
|
|
|
|
BenchmarkHelper helper;
|
|
|
|
PairwiseBenchmark pb(pairwise::Ops::Add, "PWT ADD");
|
|
TransformBenchmark tb(transform::StrictOps::Tanh, "tanh");
|
|
ScalarBenchmark sb(scalar::Multiply);
|
|
sb.setY(NDArrayFactory::create_<float>(119.0f));
|
|
|
|
PredefinedParameters a("alpha", {2, 3, 4});
|
|
PredefinedParameters b("beta", {9, 15, 27});
|
|
ParametersBatch batch({&a, &b});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
// operands go together line by line
|
|
x.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha") , p.getIntParam("beta")}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
|
|
};
|
|
|
|
auto generatorXYZ = PARAMETRIC_XYZ() {
|
|
// operands go together line by line
|
|
x.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha") , p.getIntParam("beta")}));
|
|
y.push_back(NDArrayFactory::create_<float>('f', {p.getIntParam("alpha") , p.getIntParam("beta")}));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
|
|
};
|
|
|
|
helper.runOperationSuit(&tb, generator, batch, "TransformTanh");
|
|
helper.runOperationSuit(&sb, generator, batch, "ScalarMultiply");
|
|
helper.runOperationSuit(&pb, generatorXYZ, batch, "PairwiseAdd");
|
|
}
|
|
|
|
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_5) {
|
|
BenchmarkHelper helper;
|
|
|
|
TransformBenchmark tb(transform::StrictOps::Sigmoid, "sigmoid");
|
|
IntParameters length("length", 100, 500, 100);
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
// operands go together line by line
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
x.push_back(arr);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
helper.runOperationSuit(&tb, generator, batch, "Transform_Sigmoid");
|
|
}
|
|
|
|
TEST_F(PlaygroundTests, Test_Something_5) {
|
|
auto x = NDArrayFactory::create<float>('c', {100, 10});
|
|
auto y = NDArrayFactory::create<float>('c', {10});
|
|
auto z = NDArrayFactory::create<float>('c', {100, 10});
|
|
std::vector<int> axis = {1};
|
|
|
|
NativeOpExcutioner::execBroadcast(broadcast::Add, x.buffer(), x.shapeInfo(), y.buffer(), y.shapeInfo(), z.buffer(), z.shapeInfo(),
|
|
axis.data(), axis.size(), nullptr, nullptr,
|
|
nullptr, nullptr);
|
|
}
|
|
|
|
#define PARAMETRIC_D() [&] (Parameters &p) -> Context*
|
|
/*
|
|
TEST_F(PlaygroundTests, Test_OpBenchmark_6) {
|
|
BenchmarkHelper helper;
|
|
nd4j::ops::softmax op;
|
|
DeclarableBenchmark db(op, "SoftMaxTest");
|
|
|
|
PredefinedParameters a("alpha", {128, 256});
|
|
PredefinedParameters b("beta", {1024, 2048});
|
|
ParametersBatch batch({&a, &b});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("alpha"), p.getIntParam("beta")}));
|
|
return ctx;
|
|
};
|
|
|
|
helper.runOperationSuit(&db, generator, batch, "parametrized softmax test");
|
|
}
|
|
*/
|
|
|
|
/*
|
|
TEST_F(PlaygroundTests, Test_Strided_Stuff) {
|
|
auto array = NDArrayFactory::create<float>('c', {1048576, 1024});
|
|
auto strided = array({0,0, 3, 4}, true);
|
|
auto z = NDArrayFactory::create<float>(0.0f);
|
|
//strided->shapeInfo()[shape::shapeInfoLength(strided->rankOf()) - 2] = 1024;
|
|
|
|
int N = 1000;
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
for (int e = 0; e < N; e++)
|
|
NativeOpExcutioner::execReduceSameScalar(reduce::ReduceSameBenchmarkOp, strided.buffer(), strided.shapeInfo(), nullptr, z.buffer(), z.shapeInfo());
|
|
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();
|
|
auto ttlTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart)).count();
|
|
|
|
nd4j_printf("average time: %lld us;\n", spanTime);
|
|
nd4j_printf("total time: %lld ms;\n", ttlTime);
|
|
|
|
}
|
|
*/
|
|
|
|
/*
|
|
TEST_F(PlaygroundTests, StridedReductionsNoEWS) {
|
|
nd4j_printf("SETTING ELEMENTWISE THRESHOLD AND TAD THRESHOLD TO 1/1","");
|
|
nd4j::Environment::getInstance()->setElementwiseThreshold(1);
|
|
nd4j::Environment::getInstance()->setTadThreshold(1);
|
|
BenchmarkHelper helper;
|
|
IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024
|
|
ParametersBatch batch({&stride});
|
|
//This is an edge case: technically an EWS *should* be available here
|
|
auto generator1 = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {1048576 + (stride == 1 ? 0 : 1), stride});
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
strided = new NDArray((*arr)({0,1048576, 0,1}, true)); //All rows, first column
|
|
}
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
|
|
helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator1), batch, "Strided Sum - No EWS Test 1");
|
|
//No EWS defined for this case
|
|
auto generator2 = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride});
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
strided = new NDArray((*arr)({0,2*1024,2, 0,0,0, 0,1,1}, true, true));
|
|
}
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS");
|
|
helper.runOperationSuit(&rbSum2, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator2), batch, "Strided Sum - No EWS Test 2");
|
|
}
|
|
*/
|
|
|
|
#ifndef __CUDABLAS__
|
|
|
|
// Applies an element-wise "+ 32.12f" lambda to an 8192x1024 float array
// numIterations times and measures the total elapsed time in microseconds.
// The report line is currently disabled, so the test only exercises the call.
TEST_F(PlaygroundTests, LambdaTest_1) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    auto array = NDArrayFactory::create<float>('c', {8192, 1024});
    array.linspace(1);

    auto lambda = LAMBDA_F(_x) {
        return _x + 32.12f;
    };

    const auto timeStart = Clock::now();

    for (int e = 0; e < numIterations; e++) {
        array.applyLambda<float>(lambda);
    }

    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Lambda 1 time %lld us\n", outerTime / numIterations);
}
|
|
|
|
|
|
|
|
|
|
// Despite its name this test measures a broadcast: a 1x1024 row is added to an
// 8192x1024 float array along dimension 1, numIterations times, and the total
// elapsed time is taken in microseconds. The report line is currently disabled.
TEST_F(PlaygroundTests, LambdaTest_2) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    auto array = NDArrayFactory::create<float>('c', {8192, 1024});
    auto row = NDArrayFactory::create<float>('c', {1, 1024});
    array.linspace(1);

    const auto timeStart = Clock::now();

    for (int e = 0; e < numIterations; e++) {
        array.applyBroadcast(broadcast::Add, {1}, &row);
    }

    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Broadcast time %lld us\n", outerTime / numIterations);
}
|
|
|
|
|
|
|
|
|
|
// Applies an element-wise "* 32.12f" lambda to a different 8192x1024 float array
// on each iteration, taking the arrays from a pool of poolSize copies, and
// measures the total elapsed time in microseconds. The report line is disabled.
TEST_F(PlaygroundTests, NoCacheTest_1) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    std::vector<NDArray*> pool(poolSize);
    auto source = NDArrayFactory::create<float>('c', {8192, 1024});
    // every slot gets its own copy; the copies are released at the end of the test
    for (auto &slot : pool)
        slot = source.dup();

    auto lambda = LAMBDA_F(_x) {
        return _x * 32.12f;
    };

    const auto timeStart = Clock::now();
    int cnt = 0;
    for (int e = 0; e < numIterations; e++) {
        // walk the pool from the back so consecutive iterations touch different buffers
        auto v = pool[poolSize - 1 - (cnt++)];
        v->applyLambda<float>(lambda);

        // wrap around once every pooled array has been used
        if (cnt == poolSize)
            cnt = 0;
    }

    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Non-cached time %lld us\n", outerTime / numIterations);

    for (auto v: pool)
        delete v;
}
|
|
|
|
|
|
// Pairwise variant of NoCacheTest_1: on each iteration a "_x * 32.12f + _y"
// lambda is applied to one array taken from the back of pool1 and one taken
// from the front of pool2 (8192x1024 floats each). The total elapsed time is
// measured in microseconds; the report line is currently disabled.
TEST_F(PlaygroundTests, NoCacheTest_2) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    std::vector<NDArray*> pool1(poolSize);
    std::vector<NDArray*> pool2(poolSize);
    auto source = NDArrayFactory::create<float>('c', {8192, 1024});
    // copies are created alternately for the two pools, as before
    for (size_t e = 0; e < pool1.size(); e++) {
        pool1[e] = source.dup();
        pool2[e] = source.dup();
    }

    auto lambda = LAMBDA_FF(_x, _y) {
        return _x * 32.12f + _y;
    };

    const auto timeStart = Clock::now();
    int cnt = 0;
    for (int e = 0; e < numIterations; e++) {
        auto v1 = pool1[poolSize - 1 - cnt];
        auto v2 = pool2[cnt++];
        v1->applyPairwiseLambda<float>(v2, lambda);

        // wrap around once every pooled pair has been used
        if (cnt == poolSize)
            cnt = 0;
    }

    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Non-cached PWT time %lld us\n", outerTime / numIterations);

    for (auto v: pool1)
        delete v;

    for (auto v: pool2)
        delete v;
}
|
|
|
|
#endif
|
|
|
|
|
|
// Measures sumNumber() on 1x100 float arrays taken from a pool of poolSize
// copies (currently a single iteration). Elapsed time is taken in nanoseconds
// and milliseconds; the report line is currently disabled.
TEST_F(PlaygroundTests, ReductionTest_1) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    std::vector<NDArray*> pool1(poolSize);
    auto source = NDArrayFactory::create<float>('c', {1, 100});
    for (auto &slot : pool1)
        slot = source.dup();

    const auto timeStart = Clock::now();
    int cnt = 0;
    for (int e = 0; e < 1; e++) {
        // advance through the pool (from the back) so that, if the iteration count
        // is raised, successive reductions hit different arrays; previously cnt
        // was never incremented and the wrap-around below could not trigger
        auto v = pool1[poolSize - 1 - (cnt++)];
        auto r = v->sumNumber();
        (void) r;   // only the cost of the call matters here

        if (cnt == poolSize)
            cnt = 0;
    }
    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds> (timeEnd - timeStart).count();
    auto outerTimeMs = std::chrono::duration_cast<std::chrono::milliseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Non-cached reduction time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000, outerTimeMs);

    for (auto v: pool1)
        delete v;
}
|
|
|
|
|
|
// Measures 100 consecutive in-place scalar::Add (+2.0f) calls on one 1x100
// float array through NDArray::applyScalar. Elapsed time is taken in
// nanoseconds and milliseconds; the report line is currently disabled.
//
// Earlier experiments (pooled arrays, direct ScalarTransform calls on the raw
// buffer) were already commented out; the state that only served them is gone.
TEST_F(PlaygroundTests, ScalarTest_1) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    auto source = NDArrayFactory::create<float>('c', {1, 100});

    const auto timeStart = Clock::now();
    for (int e = 0; e < 100; e++) {
        source.applyScalar(scalar::Add,2.0f);
    }
    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds> (timeEnd - timeStart).count();
    auto outerTimeMs = std::chrono::duration_cast<std::chrono::milliseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Cached scalar time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000L, outerTimeMs);
}
|
|
|
|
|
|
// Hand-rolled counterpart of ScalarTest_1: adds 2.0f to every element of a
// 1x100 float array by calling simdOps::Add directly on the raw buffer inside
// a PRAGMA_OMP_SIMD loop, repeated 1000 times. Elapsed time is taken in
// nanoseconds and milliseconds; the report line is currently disabled.
TEST_F(PlaygroundTests, ScalarTest_2) {
    // monotonic clock: elapsed-time measurements must not depend on wall-clock adjustments
    using Clock = std::chrono::steady_clock;

    auto source = NDArrayFactory::create<float>('c', {1, 100});

    const auto timeStart = Clock::now();
    float * array = reinterpret_cast<float*>(source.buffer());
    // read the length once so the loop bound is a plain invariant value
    const Nd4jLong length = source.lengthOf();
    for (int e = 0; e < 1000; e++) {

        PRAGMA_OMP_SIMD
        for (Nd4jLong i = 0; i < length; i++) {
            array[i] = simdOps::Add<float, float, float>::op(array[i], 2.0f);
        }
    }
    const auto timeEnd = Clock::now();

    auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds> (timeEnd - timeStart).count();
    auto outerTimeMs = std::chrono::duration_cast<std::chrono::milliseconds> (timeEnd - timeStart).count();

    // nd4j_printf("Cached manual scalar time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000, outerTimeMs);
}
|
|
|
|
// Smoke test for GraphProfile: fills in graph-level timings, records a few
// events and populates three node profiles with sample numbers. Nothing is
// asserted; the printOut() call that would display the result is disabled.
TEST_F(PlaygroundTests, Test_Profile_1) {
    GraphProfile graphProfile;

    // graph-level timings
    graphProfile.setBuildTime(70);
    graphProfile.setExecutionTime(130);

    // "omega" is started, three spot events are added, then "omega" is recorded
    graphProfile.startEvent("omega");
    for (const char *marker : {"alpha", "beta", "gamma"})
        graphProfile.spotEvent(marker);
    graphProfile.recordEvent("omega");

    // node profiles are requested by id and name, in id order
    auto matMulNode = graphProfile.nodeById(1, "MatMul");
    auto sumNode = graphProfile.nodeById(2, "Sum");
    auto conv2dNode = graphProfile.nodeById(3, "Conv2D");

    // node 1: MatMul
    matMulNode->setObjectsSize(512);
    matMulNode->setTemporarySize(65536);
    matMulNode->setActivationsSize(512387);
    matMulNode->setPreparationTime(127);
    matMulNode->setExecutionTime(6539);

    // node 2: Sum
    sumNode->setObjectsSize(0);
    sumNode->setTemporarySize(0);
    sumNode->setActivationsSize(512387);
    sumNode->setPreparationTime(132);
    sumNode->setExecutionTime(2047);

    // node 3: Conv2D
    conv2dNode->setObjectsSize(1536);
    conv2dNode->setTemporarySize(2355674);
    conv2dNode->setActivationsSize(1022092);
    conv2dNode->setPreparationTime(129);
    conv2dNode->setExecutionTime(12983);

    // graphProfile.printOut();
}
|
|
|
|
#ifdef GRAPH_FILES_OK
|
|
// Imports the graph stored in ./resources/ae_00.fb with profiling switched on
// and runs GraphProfilingHelper::profile on it with an argument of 2
// (presumably the number of profiled iterations - TODO confirm). Nothing is
// asserted; the printOut() call that would display the result is disabled.
TEST_F(PlaygroundTests, Test_Profile_2) {
    Environment::getInstance()->setProfiling(true);

    auto importedGraph = GraphExecutioner::importFromFlatBuffers("./resources/ae_00.fb");
    auto graphProfile = GraphProfilingHelper::profile(importedGraph, 2);

    // graphProfile->printOut();

    // both objects are released by the test itself
    delete importedGraph;
    delete graphProfile;
}
|
|
#endif
|
|
|
|
// Timing playground for im2col. The same transform is pushed through three paths
// (custom op, legacy transform, NativeOps::execCustomOp), each against a plain c-order
// output and against a permuted output. Only the custom-op status is asserted; the
// measured times feed the commented-out printouts at the end.
TEST_F(PlaygroundTests, Test_Im2Col_1) {
    using Clock = std::chrono::system_clock;
    using Micros = std::chrono::microseconds;

    int bS = 16, iC = 3, oC = 3;
    int iH = 224, iW = 224;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int iterations = 1;

    auto input = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});
    auto output = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});

    // allocated as {bS, oH, oW, iC, kH, kW} and then permuted in place
    auto outputPermuted = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    outputPermuted.permutei({0, 3, 4, 5, 1, 2});

    nd4j::ops::im2col op;

    // --- custom op, c-order output
    const auto customStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&input}, {&output}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto customEnd = Clock::now();
    auto outerTime = std::chrono::duration_cast<Micros>(customEnd - customStart).count();

    // outputPermuted.printShapeInfo("permuted shape");

    // --- custom op, permuted output
    const auto customPermStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&input}, {&outputPermuted}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto customPermEnd = Clock::now();
    auto permTime = std::chrono::duration_cast<Micros>(customPermEnd - customPermStart).count();

    // --- legacy transform, c-order output
    const auto legacyStart = Clock::now();
    ExtraArguments extra({static_cast<double>(kH), static_cast<double>(kW),
                          static_cast<double>(sH), static_cast<double>(sW),
                          static_cast<double>(pH), static_cast<double>(pW),
                          static_cast<double>(dH), static_cast<double>(dW),
                          0.0, 0.0});
    for (int iter = 0; iter < iterations; ++iter) {
        input.applyTransform(transform::Im2col, &output, &extra);
    }
    const auto legacyEnd = Clock::now();
    auto legacyTime = std::chrono::duration_cast<Micros>(legacyEnd - legacyStart).count();

    // --- legacy transform, permuted output
    const auto legacyPermStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        input.applyTransform(transform::Im2col, &outputPermuted, &extra);
    }
    const auto legacyPermEnd = Clock::now();
    auto legacyPermTime = std::chrono::duration_cast<Micros>(legacyPermEnd - legacyPermStart).count();

    // --- NativeOps entry point (named "java" in the timings below), c-order output
    NativeOps nativeOps;

    Nd4jLong iArgs[] = {kH, kW, sH, sW, pH, pW, dH, dW, 0};
    Nd4jPointer inputBuffers[] = {input.buffer()};
    Nd4jPointer inputShapes[] = {input.shapeInfo()};

    Nd4jPointer outputBuffers[] = {output.buffer()};
    Nd4jPointer outputShapes[] = {output.shapeInfo()};

    const auto javaStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        nativeOps.execCustomOp(nullptr, op.getOpHash(), inputBuffers, inputShapes, 1, outputBuffers, outputShapes, 1, nullptr, 0, iArgs, 9, nullptr, 0, false);
    }
    const auto javaEnd = Clock::now();
    auto javaTime = std::chrono::duration_cast<Micros>(javaEnd - javaStart).count();

    // --- NativeOps entry point, permuted output
    Nd4jPointer outputPermBuffers[] = {outputPermuted.buffer()};
    Nd4jPointer outputPermShapes[] = {outputPermuted.shapeInfo()};

    const auto javaPermStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        nativeOps.execCustomOp(nullptr, op.getOpHash(), inputBuffers, inputShapes, 1, outputPermBuffers, outputPermShapes, 1, nullptr, 0, iArgs, 9, nullptr, 0, false);
    }
    const auto javaPermEnd = Clock::now();
    auto javaPermTime = std::chrono::duration_cast<Micros>(javaPermEnd - javaPermStart).count();

    // nd4j_printf("New time: %lld us;\n", outerTime / iterations);
    // nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
    // nd4j_printf("Legacy time: %lld us;\n", legacyTime / iterations);
    // nd4j_printf("Legacy Permuted time: %lld us;\n", legacyPermTime / iterations);
    // nd4j_printf("Java time: %lld us;\n", javaTime / iterations);
    // nd4j_printf("Java Permuted time: %lld us;\n", javaPermTime / iterations);
}
|
|
|
|
// Runs im2col once through NativeOps::execCustomOp, writing into a permuted output
// array. Nothing is asserted; the returned status is ignored.
TEST_F(PlaygroundTests, Test_Im2Col_2) {
    auto input = NDArrayFactory::create<float>('c', {16, 3, 224, 224});

    // declared but not passed to the op below
    auto output = NDArrayFactory::create<float>('c', {16, 3, 11, 11, 55, 55});

    // allocated as {16, 55, 55, 3, 11, 11} and then permuted in place
    auto outputPermuted = NDArrayFactory::create<float>('c', {16, 55, 55, 3, 11, 11});
    outputPermuted.permutei({0, 3, 4, 5, 1, 2});

    nd4j::ops::im2col op;

    Nd4jLong iArgs[] = {11, 11, 4, 4, 2, 2, 1, 1, 0};

    Nd4jPointer inputBuffers[] = {input.buffer()};
    Nd4jPointer inputShapes[] = {input.shapeInfo()};

    Nd4jPointer outputPermBuffers[] = {outputPermuted.buffer()};
    Nd4jPointer outputPermShapes[] = {outputPermuted.shapeInfo()};

    NativeOps nativeOps;
    nativeOps.execCustomOp(nullptr, op.getOpHash(),
                           inputBuffers, inputShapes, 1,
                           outputPermBuffers, outputPermShapes, 1,
                           nullptr, 0,
                           iArgs, 9,
                           nullptr, 0,
                           false);
}
|
|
|
|
// Times the col2im custom op on c-order arrays and on permuted arrays.
// The op status is asserted for both variants; the timings are only consumed by the
// commented-out printouts at the end.
TEST_F(PlaygroundTests, Test_Col2Im_1) {
    using Clock = std::chrono::system_clock;
    using Micros = std::chrono::microseconds;

    int bS = 16, iC = 3, oC = 3;
    int iH = 224, iW = 224;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int iterations = 1;

    auto input = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});
    auto output = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});

    // permuted counterparts: allocated in a different dimension order, then permuted in place
    auto inputPermuted = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    inputPermuted.permutei({0, 3, 4, 5, 1, 2});
    auto outputPermuted = NDArrayFactory::create<float>('c', {bS, iH, iW, iC});
    outputPermuted.permutei({0, 3, 1, 2});

    // constant fill values
    input = 10.;
    output = 2.;

    inputPermuted = 10.;
    outputPermuted = 2.;

    nd4j::ops::col2im op;

    // --- c-order run
    const auto plainStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&input}, {&output}, {}, {sH, sW, pH, pW, iH, iW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto plainEnd = Clock::now();
    auto outerTime = std::chrono::duration_cast<Micros>(plainEnd - plainStart).count();

    // --- permuted run
    const auto permStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&inputPermuted}, {&outputPermuted}, {}, {sH, sW, pH, pW, iH, iW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto permEnd = Clock::now();
    auto permTime = std::chrono::duration_cast<Micros>(permEnd - permStart).count();

    // nd4j_printf("C-order time: %lld us;\n", outerTime / iterations);
    // nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
}
|
|
|
|
// Times the im2col custom op on c-order arrays and on permuted arrays.
// The op status is asserted for both variants; the timings are only consumed by the
// commented-out printouts at the end.
TEST_F(PlaygroundTests, Test_Im2Col_3) {
    using Clock = std::chrono::system_clock;
    using Micros = std::chrono::microseconds;

    int bS = 16, iC = 3, oC = 3;
    int iH = 224, iW = 224;
    int kH = 11, kW = 11;
    int sH = 4, sW = 4;
    int pH = 2, pW = 2;
    int dH = 1, dW = 1;
    int oH = 55, oW = 55;
    int iterations = 1;

    auto output = NDArrayFactory::create<float>('c', {bS, iC, kH, kW, oH, oW});
    auto input = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});

    // permuted counterparts: allocated in a different dimension order, then permuted in place
    auto outputPermuted = NDArrayFactory::create<float>('c', {bS, oH, oW, iC, kH, kW});
    outputPermuted.permutei({0, 3, 4, 5, 1, 2});
    auto inputPermuted = NDArrayFactory::create<float>('c', {bS, iH, iW, iC});
    inputPermuted.permutei({0, 3, 1, 2});

    // constant fill values
    input = 10.;
    output = 2.;

    inputPermuted = 10.;
    outputPermuted = 2.;

    nd4j::ops::im2col op;

    // --- c-order run
    const auto plainStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&input}, {&output}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto plainEnd = Clock::now();
    auto outerTime = std::chrono::duration_cast<Micros>(plainEnd - plainStart).count();

    // --- permuted run
    const auto permStart = Clock::now();
    for (int iter = 0; iter < iterations; ++iter) {
        const auto status = op.execute({&inputPermuted}, {&outputPermuted}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 0}, {});
        ASSERT_EQ(Status::OK(), status);
    }
    const auto permEnd = Clock::now();
    auto permTime = std::chrono::duration_cast<Micros>(permEnd - permStart).count();

    // nd4j_printf("C-order time: %lld us;\n", outerTime / iterations);
    // nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
}
|
|
|
|
|
|
// Threshold-encoding timing playground.
// NOTE: currently disabled - the unconditional early return below skips everything else.
// When enabled it generates a random-uniform array, estimates the number of elements
// above a threshold and times estimateThreshold + convertToThreshold.
TEST_F(PlaygroundTests, loop_test_1) {
    if (1 > 0)
        return;

    auto shapeArg = NDArrayFactory::create<float>('c', {2}, {5000, 10000});
    nd4j::ops::randomuniform op;

    auto result = op.execute({&shapeArg}, {-1.0f, 1.0f}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto array = result->at(0);

    auto buffer = array->buffer();
    int cnt = 0;
    int iterations = 1;

    //nd4j_printf("Array length: %lld\n", array->lengthOf());

    int length = (int) array->lengthOf();
    int span = (int) (array->lengthOf() / 6) + 8;

    NativeOps ops;

    // scratch buffer handed to convertToThreshold; released at the end of the test
    auto encoded = new int[1000000];

    // FloatBits is used to store the threshold's float bit pattern into an int slot
    FloatBits fb;
    float threshold = 0.99f;
    fb.f_ = threshold;
    int le = ops.estimateThreshold(nullptr, reinterpret_cast<void *>(array->buffer()), array->shapeInfo(), static_cast<int>(array->lengthOf()), threshold);

    // first three ints: estimate, array length, threshold bits
    // (presumably the header convertToThreshold expects - TODO confirm)
    encoded[0] = le;
    encoded[1] = length;
    encoded[2] = fb.i_;

    //nd4j_printf("number of elements: [%i]\n", le);

    long permTime = 0;

    for (int iter = 0; iter < iterations; iter++) {
        const auto iterStart = std::chrono::system_clock::now();

        ops.estimateThreshold(nullptr, reinterpret_cast<void *>(array->buffer()), array->shapeInfo(), static_cast<int>(array->lengthOf()), threshold);
        TypeCast::convertToThreshold<float>(nullptr, buffer, array->lengthOf(), encoded);

        const auto iterEnd = std::chrono::system_clock::now();
        permTime += std::chrono::duration_cast<std::chrono::microseconds> (iterEnd - iterStart).count();
    }

    nd4j_printf("Permuted time: %lld us; Counter: %i;\n", permTime / iterations, cnt);

    delete result;
    delete[] encoded;
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Tiles a c-order {20, 30} array with repeats {2, 2, 2}, timing the call, and checks
// that the result has the shape {2, 40, 60}.
TEST_F(PlaygroundTests, ndarray_tile_test1) {
    auto x = NDArrayFactory::create<float>('c', {20, 30});
    auto expected = NDArrayFactory::create<float>('c', {2, 40, 60});

    const auto tileStart = std::chrono::system_clock::now();
    auto tiled = x.tile({2, 2, 2});
    const auto tileEnd = std::chrono::system_clock::now();

    // only consumed by the commented-out printout below
    auto time = std::chrono::duration_cast<std::chrono::microseconds>(tileEnd - tileStart).count();
    // nd4j_printf("c-order time: %d;\n", time);

    ASSERT_TRUE(tiled.isSameShape(&expected));
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Tiles an f-order {20, 30} array with repeats {2, 2, 2}, timing the call, and checks
// that the result has the shape {2, 40, 60}.
TEST_F(PlaygroundTests, ndarray_tile_test2) {
    auto x = NDArrayFactory::create<float>('f', {20, 30});
    auto expected = NDArrayFactory::create<float>('f', {2, 40, 60});

    const auto tileStart = std::chrono::system_clock::now();
    auto tiled = x.tile({2, 2, 2});
    const auto tileEnd = std::chrono::system_clock::now();

    // only consumed by the commented-out printout below
    auto time = std::chrono::duration_cast<std::chrono::microseconds>(tileEnd - tileStart).count();
    // nd4j_printf("f-order time: %d;\n", time);

    ASSERT_TRUE(tiled.isSameShape(&expected));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Copies an f-order 4D array into a c-order 3D array of the same length (24000
// elements) element by element via shape::getIndexOffset, inside an OpenMP loop.
// The very same loop is timed twice ("My time" / "Old time") and both timings are
// printed. There is no real assertion.
TEST_F(PlaygroundTests, loopThroughArrs_test1) {
    NDArray x('c', {20, 30, 40}, nd4j::DataType::DOUBLE);
    NDArray y('f', {50, 30, 4, 4}, nd4j::DataType::DOUBLE);

    auto dst = x.bufferAsT<double>();
    auto src = y.bufferAsT<double>();

    auto len = x.lengthOf();

    //***********************************
    // first timed pass
    //***********************************
    const auto firstStart = std::chrono::system_clock::now();
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
    for (Nd4jLong i = 0; i < len; ++i) {
        const Nd4jLong dstOffset = shape::getIndexOffset(i, x.getShapeInfo(), len);
        const Nd4jLong srcOffset = shape::getIndexOffset(i, y.getShapeInfo(), len);

        dst[dstOffset] = src[srcOffset];
    }
    const auto firstEnd = std::chrono::system_clock::now();
    auto myTime = std::chrono::duration_cast<std::chrono::microseconds>(firstEnd - firstStart).count();

    //***********************************
    // second timed pass (identical loop)
    //***********************************
    const auto secondStart = std::chrono::system_clock::now();
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
    for (Nd4jLong i = 0; i < len; ++i) {
        const Nd4jLong dstOffset = shape::getIndexOffset(i, x.getShapeInfo(), len);
        const Nd4jLong srcOffset = shape::getIndexOffset(i, y.getShapeInfo(), len);

        dst[dstOffset] = src[srcOffset];
    }
    const auto secondEnd = std::chrono::system_clock::now();
    auto oldTime = std::chrono::duration_cast<std::chrono::microseconds>(secondEnd - secondStart).count();

    nd4j_printf("My time: %lld us;\n", myTime);
    nd4j_printf("Old time: %lld us;\n", oldTime);

    ASSERT_TRUE(1);
}
|
|
|
|
|
|
static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
|
|
|
|
auto len = shape::length(xShapeInfo);
|
|
int xEws = shape::elementWiseStride(xShapeInfo);
|
|
int yEws = shape::elementWiseStride(yShapeInfo);
|
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
|
|
|
BlockInformation info(len, ELEMENT_THRESHOLD);
|
|
PRAGMA_OMP_PARALLEL_ARGS(num_threads(info.threads) if (info.threads > 1))
|
|
{
|
|
auto i = omp_get_thread_num();
|
|
Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder;
|
|
Nd4jLong index = i * info.items;
|
|
auto xi = x + xEws * index;
|
|
auto yi = y + yEws * index;
|
|
auto zi = z + zEws * index;
|
|
PRAGMA_OMP_SIMD
|
|
for (Nd4jLong j = 0; j < itemsToLoop; j++)
|
|
zi[j * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(xi[j * xEws], yi[j * yEws]);
|
|
}
|
|
}
|
|
|
|
static void loopSimple(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
|
|
|
|
auto len = shape::length(xShapeInfo);
|
|
int xEws = shape::elementWiseStride(xShapeInfo);
|
|
int yEws = shape::elementWiseStride(yShapeInfo);
|
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
|
int threads = 6;
|
|
int span_size = len / threads + 1;
|
|
|
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close))
|
|
for(Nd4jLong i = 0; i < len; ++i)
|
|
z[i * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(x[i * xEws], y[i * yEws]);
|
|
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Compares loopSimple against loopSpan: each is run `iterations` times on randomly
// picked (x, y, z) triples from a pool of identical FLOAT32 {40, 25} arrays, and the
// average time per iteration is printed. There is no real assertion.
TEST_F(PlaygroundTests, loopThroughArrs_test2) {
    NDArray x('c', {40, 25}, nd4j::DataType::FLOAT32);

    const int iterations = 1;
    const int arrays = 10;

    // pool of copies of x
    std::vector<NDArray> arrs(arrays);
    for (auto& item : arrs)
        item = x;

    //***********************************
    // loopSimple (note: srand() is inside the timed region, as in the original measurement)
    const auto simpleStart = std::chrono::system_clock::now();
    srand(119);
    for (Nd4jLong iter = 0; iter < iterations; ++iter) {
        // rand() is consumed in x, y, z order
        const int xIdx = rand() % arrays;
        const int yIdx = rand() % arrays;
        const int zIdx = rand() % arrays;

        auto xData = arrs[xIdx].bufferAsT<float>();
        auto yData = arrs[yIdx].bufferAsT<float>();
        auto zData = arrs[zIdx].bufferAsT<float>();

        auto xInfo = arrs[xIdx].getShapeInfo();
        auto yInfo = arrs[yIdx].getShapeInfo();
        auto zInfo = arrs[zIdx].getShapeInfo();

        loopSimple(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    const auto simpleEnd = std::chrono::system_clock::now();
    auto simpleTime = std::chrono::duration_cast<std::chrono::microseconds>((simpleEnd - simpleStart) / iterations).count();

    //***********************************
    // loopSpan (continues the same rand() sequence)
    const auto spanStart = std::chrono::system_clock::now();
    for (Nd4jLong iter = 0; iter < iterations; ++iter) {
        const int xIdx = rand() % arrays;
        const int yIdx = rand() % arrays;
        const int zIdx = rand() % arrays;

        auto xData = arrs[xIdx].bufferAsT<float>();
        auto yData = arrs[yIdx].bufferAsT<float>();
        auto zData = arrs[zIdx].bufferAsT<float>();

        auto xInfo = arrs[xIdx].getShapeInfo();
        auto yInfo = arrs[yIdx].getShapeInfo();
        auto zInfo = arrs[zIdx].getShapeInfo();

        loopSpan(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    const auto spanEnd = std::chrono::system_clock::now();
    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds>((spanEnd - spanStart) / iterations).count();

    nd4j_printf("simple time: %lld us;\n", simpleTime);
    nd4j_printf("span time: %lld us;\n", spanTime);

    ASSERT_TRUE(1);
}
|
|
|
|
static void loop1(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
|
|
|
|
auto len = shape::length(xShapeInfo);
|
|
int xEws = shape::elementWiseStride(xShapeInfo);
|
|
int yEws = shape::elementWiseStride(yShapeInfo);
|
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
|
|
|
nd4j::OmpLaunchHelper info(len);
|
|
PRAGMA_OMP_PARALLEL_ARGS(num_threads(info._numThreads))
|
|
{
|
|
auto threadNum = omp_get_thread_num();
|
|
Nd4jLong threadOffset = info.getThreadOffset(threadNum);
|
|
PRAGMA_OMP_SIMD
|
|
for (Nd4jLong j = 0; j < info.getItersPerThread(threadNum); j++) {
|
|
Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len);
|
|
Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len);
|
|
Nd4jLong zOffset = shape::getIndexOffset(j+threadOffset, zShapeInfo, len);
|
|
z[xOffset] = simdOps::LogPoissonLoss<float, float, float>::op(x[xOffset], y[xOffset]);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void loop2(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo, float* z, Nd4jLong* zShapeInfo) {
|
|
|
|
auto len = shape::length(xShapeInfo);
|
|
int xEws = shape::elementWiseStride(xShapeInfo);
|
|
int yEws = shape::elementWiseStride(yShapeInfo);
|
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
|
int threads = 6;
|
|
int span_size = len / threads + 1;
|
|
|
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static))
|
|
for(Nd4jLong i = 0; i < len; ++i) {
|
|
Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
|
Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
|
Nd4jLong zOffset = shape::getIndexOffset(i, zShapeInfo, len);
|
|
z[xOffset] = simdOps::LogPoissonLoss<float, float, float>::op(x[xOffset], y[xOffset]);
|
|
}
|
|
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Compares loop2 (single parallel-for) against loop1 (OmpLaunchHelper spans): each is
// run `iterations` times on randomly picked (x, y, z) triples from a pool of identical
// FLOAT32 {50, 250} arrays, and the average time per iteration is printed.
// There is no real assertion.
TEST_F(PlaygroundTests, loopThroughArrs_test3) {
    NDArray x('c', {50, 250}, nd4j::DataType::FLOAT32);

    const int iterations = 1;
    const int arrays = 100;

    // pool of copies of x
    std::vector<NDArray> arrs(arrays);
    for (auto& item : arrs)
        item = x;

    //***********************************
    // loop2 (note: srand() is inside the timed region, as in the original measurement)
    const auto simpleStart = std::chrono::system_clock::now();
    srand(119);
    for (Nd4jLong iter = 0; iter < iterations; ++iter) {
        // rand() is consumed in x, y, z order
        const int xIdx = rand() % arrays;
        const int yIdx = rand() % arrays;
        const int zIdx = rand() % arrays;

        auto xData = arrs[xIdx].bufferAsT<float>();
        auto yData = arrs[yIdx].bufferAsT<float>();
        auto zData = arrs[zIdx].bufferAsT<float>();

        auto xInfo = arrs[xIdx].getShapeInfo();
        auto yInfo = arrs[yIdx].getShapeInfo();
        auto zInfo = arrs[zIdx].getShapeInfo();

        loop2(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    const auto simpleEnd = std::chrono::system_clock::now();
    auto simpleTime = std::chrono::duration_cast<std::chrono::microseconds>((simpleEnd - simpleStart) / iterations).count();

    //***********************************
    // loop1 (continues the same rand() sequence)
    const auto spanStart = std::chrono::system_clock::now();
    for (Nd4jLong iter = 0; iter < iterations; ++iter) {
        const int xIdx = rand() % arrays;
        const int yIdx = rand() % arrays;
        const int zIdx = rand() % arrays;

        auto xData = arrs[xIdx].bufferAsT<float>();
        auto yData = arrs[yIdx].bufferAsT<float>();
        auto zData = arrs[zIdx].bufferAsT<float>();

        auto xInfo = arrs[xIdx].getShapeInfo();
        auto yInfo = arrs[yIdx].getShapeInfo();
        auto zInfo = arrs[zIdx].getShapeInfo();

        loop1(xData, xInfo, yData, yInfo, zData, zInfo);
    }
    const auto spanEnd = std::chrono::system_clock::now();
    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds>((spanEnd - spanStart) / iterations).count();

    nd4j_printf("simpleTime time: %lld us;\n", simpleTime);
    nd4j_printf("spanTime time: %lld us;\n", spanTime);

    ASSERT_TRUE(1);
}
|
|
|
|
// Times the batched skipgram op on a synthetic batch of 64 targets with
// pseudo-randomly generated indices/codes. The negative-sampling inputs and the
// inference vector are passed as empty arrays. The op status is asserted on every
// iteration and the average/total time is printed.
TEST_F(PlaygroundTests, test_batched_skipgram_1) {
    const int batchSize = 64;
    const int codeLen = 6;
    const int numWords = 244;
    const int vectorLength = 50;

    auto target = NDArrayFactory::create<int>('c', {batchSize});
    auto ngStarter = NDArrayFactory::empty<int>();
    auto indices = NDArrayFactory::create<int>('c', {batchSize, codeLen});
    auto codes = NDArrayFactory::create<int8_t>('c', {batchSize, codeLen});
    auto syn0 = NDArrayFactory::create<float>('c', {numWords, vectorLength});
    auto syn1 = NDArrayFactory::create<float>('c', {numWords, vectorLength});
    auto syn1Neg = NDArrayFactory::empty<float>();
    // heap-allocated by linspace; released at the end of the test
    auto expTable = NDArrayFactory::linspace<float>(0.001, 0.995, 10000);
    auto negTable = NDArrayFactory::empty<float>();

    auto alpha = NDArrayFactory::create<double>('c', {batchSize});
    auto randomValue = NDArrayFactory::create<Nd4jLong>('c', {batchSize});
    auto inferenceVector = NDArrayFactory::empty<float>();

    syn0.assign(0.01);
    syn1.assign(0.02);

    // fill the batch: row = target index, a per-row learning rate that shrinks by 0.001
    // per row, and indices/codes driven by a simple multiplicative generator
    Nd4jLong seed = 2843242345121L;
    auto learningRate = 0.025;
    for (int row = 0; row < batchSize; row++) {
        target.p(row, row);
        alpha.p(row, learningRate);
        randomValue.p(row, seed);

        learningRate -= 0.001;

        // note: column 0 is never written here
        for (int col = 1; col < codeLen; col++) {
            indices.p(row, col, nd4j::math::nd4j_abs<Nd4jLong>(seed % numWords));
            codes.p(row, col, col % 2);

            seed = nd4j::math::nd4j_abs<Nd4jLong>(seed * 25214903917L + 11);
        }

        seed = nd4j::math::nd4j_abs<Nd4jLong>(seed * 25214903917L + 11);
    }

    //indices.printIndexedBuffer("indices");
    //codes.printIndexedBuffer("codes");

    auto iterations = 1;

    nd4j::ops::skipgram op;

    const auto runStart = std::chrono::system_clock::now();
    for (int iter = 0; iter < iterations; iter++) {
        auto result = op.execute({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, expTable, &negTable, &alpha, &randomValue, &inferenceVector}, {}, {}, {false}, true);
        ASSERT_EQ(Status::OK(), result->status());
        delete result;
    }
    const auto runEnd = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds>((runEnd - runStart) / iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds>((runEnd - runStart)).count();

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);

    delete expTable;
}
|
|
|
|
|
|
// Times a full reduce::Mean of a {32, 128, 256, 256} float array into a scalar via
// NativeOpExecutioner::execReduceFloatScalar: one warm-up call, then `iterations`
// timed calls. Average and total times are printed; nothing is asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_float_1) {
    auto array = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
    auto target = NDArrayFactory::create<float>(0.0f);

    // single place for the (long) executioner call used by both warm-up and timed run
    auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceFloatScalar(LaunchContext::defaultContext(), reduce::Mean,
                                                   array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(),
                                                   nullptr,
                                                   target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto runStart = std::chrono::system_clock::now();
    for (int iter = 0; iter < iterations; iter++) {
        reduceOnce();
    }
    const auto runEnd = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds>((runEnd - runStart) / iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds>((runEnd - runStart)).count();

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
|
|
|
|
// Times reduce::ReduceFloatBenchmarkOp over a 100000-element float vector into a
// scalar via NativeOpExecutioner::execReduceFloatScalar: one warm-up call, then
// `iterations` timed calls. The average is reported in nanoseconds, the total in
// milliseconds; nothing is asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_float_2) {
    auto array = NDArrayFactory::create<float>('c', {100000});
    auto target = NDArrayFactory::create<float>(0.0f);

    // single place for the (long) executioner call used by both warm-up and timed run
    auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceFloatScalar(LaunchContext::defaultContext(), reduce::ReduceFloatBenchmarkOp,
                                                   array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(),
                                                   nullptr,
                                                   target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto runStart = std::chrono::system_clock::now();
    for (int iter = 0; iter < iterations; iter++) {
        reduceOnce();
    }
    const auto runEnd = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::nanoseconds>((runEnd - runStart) / iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds>((runEnd - runStart)).count();

    nd4j_printf("average time: %lld ns;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
|
|
|
|
// Times reduce::ReduceSameBenchmarkOp over a 100000-element float vector into a
// scalar via NativeOpExecutioner::execReduceSameScalar: one warm-up call, then
// `iterations` timed calls. The average is reported in nanoseconds, the total in
// milliseconds; nothing is asserted.
TEST_F(PlaygroundTests, test_reduce_scalar_same_2) {
    auto array = NDArrayFactory::create<float>('c', {100000});
    auto target = NDArrayFactory::create<float>(0.0f);

    // single place for the (long) executioner call used by both warm-up and timed run
    auto reduceOnce = [&]() {
        NativeOpExecutioner::execReduceSameScalar(LaunchContext::defaultContext(), reduce::ReduceSameBenchmarkOp,
                                                  array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(),
                                                  nullptr,
                                                  target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo());
    };

    // warm up
    for (int warm = 0; warm < 1; warm++) {
        reduceOnce();
    }

    int iterations = 1;
    const auto runStart = std::chrono::system_clock::now();
    for (int iter = 0; iter < iterations; iter++) {
        reduceOnce();
    }
    const auto runEnd = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::nanoseconds>((runEnd - runStart) / iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds>((runEnd - runStart)).count();

    nd4j_printf("average time: %lld ns;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
}
|
|
|
|
|
|
// Times transform::Assign from one {32, 64, 128, 128} float array into another via
// NativeOpExecutioner::execTransformAny: five warm-up calls, then `iterations` timed
// calls. Prints the average/total time and a bandwidth figure derived from the array
// size and the average time. Nothing is asserted.
TEST_F(PlaygroundTests, test_assign_float) {
    // auto array = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
    // auto target = NDArrayFactory::create<float>('c', {32, 128, 256, 256});

    auto array = NDArrayFactory::create<float>('c', {32, 64, 128, 128});
    auto target = NDArrayFactory::create<float>('c', {32, 64, 128, 128});

    array.assign(119);

    // single place for the (long) executioner call used by both warm-up and timed run
    auto assignOnce = [&]() {
        NativeOpExecutioner::execTransformAny(LaunchContext::defaultContext(), transform::Assign,
                                              array.buffer(), array.shapeInfo(), array.specialBuffer(), array.specialShapeInfo(),
                                              target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(),
                                              nullptr, nullptr, nullptr);
    };

    // warm up
    for (int warm = 0; warm < 5; warm++) {
        assignOnce();
    }

    int iterations = 1;
    const auto runStart = std::chrono::system_clock::now();
    for (int iter = 0; iter < iterations; iter++) {
        assignOnce();
    }
    const auto runEnd = std::chrono::system_clock::now();

    auto spanTime = std::chrono::duration_cast<std::chrono::microseconds>((runEnd - runStart) / iterations).count();
    auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds>((runEnd - runStart)).count();

    // bytes moved per second (spanTime is in microseconds), scaled down by 1024^3
    auto bw = (1000000L * (float) (array.lengthOf() * array.sizeOfT()) / spanTime) / 1024 / 1024 / 1024;

    nd4j_printf("average time: %lld us;\n", spanTime);
    nd4j_printf("total time: %lld ms;\n", ttlTime);
    nd4j_printf("Bandwidth: %f GB/s\n", bw);
}
|
|
|
|
/*
|
|
TEST_F(PlaygroundTests, test_manual_loop) {
|
|
const unsigned int len = 32 * 128 * 256 * 256;
|
|
auto array = new float[len];
|
|
auto z = new float[len];
|
|
|
|
for (unsigned int e = 0; e < len; e++)
|
|
array[e] = (float) e;
|
|
|
|
const int iterations = 100;
|
|
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
for (int i = 0; i < iterations; i++) {
|
|
|
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(4) schedule(static, 32768))
|
|
for (unsigned int e = 0; e < len; e++)
|
|
z[e] = array[e];
|
|
}
|
|
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
|
|
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
|
|
auto bw = (1000000L * (float) (len * sizeof(float)) / spanTime) / 1024 / 1024 / 1024;
|
|
|
|
nd4j_printf("length: %i\n", len);
|
|
nd4j_printf("average time: %lld us;\n", spanTime);
|
|
nd4j_printf("total time: %lld ms;\n", ttlTime);
|
|
nd4j_printf("Bandwidth: %f GB/s\n", bw)
|
|
|
|
delete[] array;
|
|
delete[] z;
|
|
}
|
|
|
|
TEST_F(PlaygroundTests, test_col2im_permuted_1) {
|
|
auto x = NDArrayFactory::create<float>('c', {8, 64, 55, 55, 3, 3});
|
|
x.assign(1.f);
|
|
x.permutei({0, 1, 4, 5, 2, 3});
|
|
|
|
auto z0 = NDArrayFactory::create<float>('c', {64, 8, 112, 112});
|
|
z0.permutei({1, 0, 2, 3});
|
|
|
|
auto z1 = NDArrayFactory::create<float>('c', {64, 8, 112, 112});
|
|
z1.permutei({1, 0, 2, 3});
|
|
|
|
nd4j_printf("Starting custom run...\n","");
|
|
const int iterations = 100;
|
|
nd4j::ops::col2im op;
|
|
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
for (int e = 0; e < iterations; e++) {
|
|
op.execute({&x}, {&z0}, {}, {2, 2, 0, 0, 112, 112, 1, 1, 1}, {});
|
|
}
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
|
|
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
|
|
|
|
nd4j_printf("Starting legacy run...\n","");
|
|
ExtraArguments arguments({2., 2., 0., 0., 112., 112., 1., 1.});
|
|
|
|
auto legacyStart = std::chrono::system_clock::now();
|
|
for (int e = 0; e < iterations; e++)
|
|
x.applyTransform(transform::Col2Im, &z1, &arguments);
|
|
|
|
auto legacyEnd = std::chrono::system_clock::now();
|
|
auto legacySpanTime = std::chrono::duration_cast<std::chrono::microseconds> ((legacyEnd - legacyStart) / iterations).count();
|
|
auto legacyTtlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((legacyEnd - legacyStart)).count();
|
|
|
|
nd4j_printf("average time: %lld us vs %lld us;\n", spanTime, legacySpanTime);
|
|
nd4j_printf("total time: %lld ms vs %lld ms;\n", ttlTime, legacyTtlTime);
|
|
|
|
ASSERT_EQ(z0, z1);
|
|
}
|
|
|
|
|
|
TEST_F(PlaygroundTests, test_addi_assign) {
|
|
int iterations = 1;
|
|
auto x = NDArrayFactory::create<float>('c', {1000000000});
|
|
auto z = NDArrayFactory::create<float>('c', {1000000000});
|
|
x.assign(119.0f);
|
|
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
|
|
x.applyScalar(scalar::Add,1.0f, &z, nullptr);
|
|
//z.assign(x);
|
|
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto spanTime = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / iterations).count();
|
|
auto ttlTime = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd - timeStart)).count();
|
|
auto bw = (1000000L * (float) (x.lengthOf() * x.sizeOfT()) / spanTime) / 1024 / 1024 / 1024;
|
|
|
|
nd4j_printf("Avg add(1.0f) time: %lld us\n", spanTime);
|
|
nd4j_printf("Bandwidth: %f GB/s\n", bw);
|
|
}
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////
|
|
TEST_F(PlaygroundTests, conv2d_1) {
|
|
|
|
const int N = 100;
|
|
int bS=8, iH=64,iW=64, iC=32,oC=32, kH=2,kW=2, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
|
|
int paddingMode = 1; // 1-SAME, 0-VALID;
|
|
int dataFormat = 0; // 1-NHWC, 0-NCHW
|
|
|
|
NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
|
|
NDArray output(input);
|
|
NDArray weights('c', {kH, kW, iC, oC}, nd4j::DataType::FLOAT32);
|
|
NDArray bias('c', {oC}, nd4j::DataType::FLOAT32);
|
|
input = 2.;
|
|
weights.linspace(0.1, 0.1);
|
|
bias = 0.5;
|
|
|
|
nd4j::ops::conv2d op;
|
|
for (int i = 0; i < 10; i++)
|
|
100.5*0.5;
|
|
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
|
|
for (int i = 0; i < N; i++)
|
|
op.execute({&input, &weights, &bias}, {&output} , {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat},{});
|
|
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto duration = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / N).count();
|
|
printf("duration %ld\n", duration);
|
|
}
|
|
*/
|
|
|
|
/////////////////////////////////////////////////////////////////////
|
|
// Times one execution of the batchnorm_new op on a FLOAT32 {8, 32, 64, 64} input with
// per-channel ({32}) mean / variance / gamma / beta and prints the elapsed microseconds.
// The test makes no assertion on the numerical result; it is a timing probe only.
TEST_F(PlaygroundTests, batchnorm_1) {

    // Divisor for the average below. The repeat loop is currently disabled, so the op
    // runs exactly once and N must stay 1 for the printed number to be meaningful.
    const int N = 1;

    NDArray input   ('c', {8, 32, 64, 64}, nd4j::DataType::FLOAT32);
    NDArray output  ('c', {8, 32, 64, 64}, nd4j::DataType::FLOAT32);
    NDArray mean    ('c', {32}, nd4j::DataType::FLOAT32);
    NDArray variance('c', {32}, nd4j::DataType::FLOAT32);
    NDArray gamma   ('c', {32}, nd4j::DataType::FLOAT32);
    NDArray beta    ('c', {32}, nd4j::DataType::FLOAT32);

    // Fill every operand with a constant so the op works on defined data.
    input    = 10.5;
    mean     = 5.5;
    variance = 1.5;
    gamma    = 0.5;
    beta     = 2.5;

    nd4j::ops::batchnorm_new op;

    // steady_clock is monotonic; system_clock can be adjusted while the op is running and
    // would then report a bogus (even negative) interval.
    const auto timeStart = std::chrono::steady_clock::now();
    // for (int i = 0; i <N ; i++)
    // NOTE(review): {1e-5} is presumably epsilon and {1,1,1} presumably the
    // apply-scale / apply-offset / axis arguments - confirm against the op declaration.
    op.execute({&input, &mean, &variance, &gamma, &beta}, {&output}, {1e-5}, {1,1,1}, {});
    const auto timeEnd = std::chrono::steady_clock::now();

    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // duration's integer type is implementation-defined; cast so it always matches %lld.
    printf("duration %lld\n", static_cast<long long>(duration));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Times the softmax op on a FLOAT32 {1024, 256} input filled by linspace(-100, 0.01) and
// prints the average microseconds per call. No assertion is made on the output values.
TEST_F(PlaygroundTests, softmax_1) {

    const int N = 1;    // number of timed executions
    NDArray input ('c', {1024, 256}, nd4j::DataType::FLOAT32);
    NDArray output('c', {1024, 256}, nd4j::DataType::FLOAT32);

    input.linspace(-100., 0.01);

    nd4j::ops::softmax op;

    // The former "warm up" loop only evaluated the constant expression 100.5*100.5 and
    // therefore did nothing; it has been removed rather than left as misleading dead code.

    // Monotonic clock: immune to wall-clock adjustments during the measurement.
    const auto timeStart = std::chrono::steady_clock::now();

    for (int i = 0; i < N; i++) {
        // NOTE(review): the integer argument {1} is presumably the softmax dimension - confirm.
        op.execute({&input}, {&output}, {}, {1}, {});
    }

    const auto timeEnd = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // Cast so the argument type matches the format on every platform.
    printf("duration %lld\n", static_cast<long long>(duration));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Builds two sub-arrays of a FLOAT32 {10, 5} array through NDArray::operator() using the
// same index list, once with the default second argument and once with `true`, then prints
// the shape info of each for visual comparison. Nothing is asserted.
TEST_F(PlaygroundTests, subarr_1) {

    NDArray source('c', {10, 5}, nd4j::DataType::FLOAT32);

    // NOTE(review): the index list is presumably (start, end) pairs per dimension, and the
    // `true` flag presumably keeps unit dimensions in the resulting shape - confirm
    // against NDArray::operator().
    NDArray viewDefault = source({0,0, 3,4});
    NDArray viewFlagged = source({0,0, 3,4}, true);

    viewDefault.printShapeInfo("subArr1");
    viewFlagged.printShapeInfo("subArr2");
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Takes the sub-array {all, point(2)} of a FLOAT32 {10, 5} c-order array, prints its shape
// info and checks that its element-wise stride (ews) equals 5.
TEST_F(PlaygroundTests, subarr_2) {

    NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32);

    // subarray() hands back a heap-allocated NDArray that this test owns.
    auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)});

    subArr1->printShapeInfo("subArr1");

    // Capture the value and free the sub-array BEFORE asserting: ASSERT_EQ returns from the
    // test body on failure, which previously skipped the delete and leaked the array.
    const auto elementWiseStride = subArr1->ews();
    delete subArr1;

    ASSERT_EQ(5, elementWiseStride);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Placeholder: the entire body is commented out, so this test currently executes nothing
// and always passes. The disabled code is kept for reference. It timed
// NDArray::reduceAlongDimension (Mean, then Sum) against Loops::loopReduce with the
// simdOps::Mean callbacks, and asserted that the two Mean results are equal.
TEST_F(PlaygroundTests, loops_1) {
/*
    const int N = 1;
    NDArray x('c', {16, 32, 64, 64}, nd4j::DataType::FLOAT32);
    NDArray z1('c', {32}, nd4j::DataType::FLOAT32);
    NDArray z2('c', {32}, nd4j::DataType::FLOAT32);
    NDArray z3('c', {32}, nd4j::DataType::FLOAT32);
    std::vector<int> dimsToExclude = {0,2,3};
    std::vector<int> tadDims = {1};
    x.linspace(0.01);

    // warm up
    for (int i = 0; i < 1000; ++i)
        32*512;

    auto timeStart1 = std::chrono::system_clock::now();
    for (int i = 0; i < N ; i++)
        x.reduceAlongDimension(nd4j::reduce::Mean, &z1, dimsToExclude);
    auto timeEnd1 = std::chrono::system_clock::now();
    auto duration1 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd1 - timeStart1) / N).count();

    auto timeStartE = std::chrono::system_clock::now();
    for (int i = 0; i < N ; i++)
        x.reduceAlongDimension(nd4j::reduce::Sum, &z3, dimsToExclude);
    auto timeEndE = std::chrono::system_clock::now();
    auto durationE = std::chrono::duration_cast<std::chrono::microseconds> ((timeEndE - timeStartE) / N).count();

    Nd4jLong *tadShapeInfo(nullptr), *tadOffsets(nullptr);
    x.getSubArrShapeAndOffsets(tadDims, tadShapeInfo, tadOffsets);

    // shape::printShapeInfoLinear(tadShapeInfo);
    // shape::printIntArray(tadOffsets, 32);

    auto timeStart2 = std::chrono::system_clock::now();

    for (int i = 0; i < N ; i++)
        Loops::loopReduce<float, float, float>(x.bufferAsT<float>(), tadShapeInfo, tadOffsets,
                                               z2.bufferAsT<float>(), z2.getShapeInfo(),
                                               nullptr,
                                               &simdOps::Mean<float,float>::startingValue,
                                               &simdOps::Mean<float,float>::update,
                                               &simdOps::Mean<float,float>::op,
                                               &simdOps::Mean<float,float>::postProcess);

    auto timeEnd2 = std::chrono::system_clock::now();
    auto duration2 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd2 - timeStart2) / N).count();

    RELEASE(tadShapeInfo, x.getWorkspace());
    RELEASE(tadOffsets, x.getWorkspace());

    // z1.printIndexedBuffer("z1 ");
    // z2.printIndexedBuffer("z2 ");

    ASSERT_TRUE(z1.equalsTo(z2));

    printf("duration old: %ld\n", duration1);
    printf("duration new: %ld\n", duration2);
    printf("duration E: %ld\n", durationE);
*/
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Times ConstantTadHelper::tadForDimensions for a hand-written rank-4 shape descriptor
// ({1024, 1024, 1024, 1024}, row-major strides) with TAD dimensions {0, 2}, and prints the
// average microseconds per call. Only the shape descriptor is used; no data buffer of that
// size is ever allocated.
//
// Removed relative to the earlier version, none of which affected the measured call:
//   * two scratch buffers (shape-info sized and numOfSubArrs sized, the latter roughly
//     8 MB) that were allocated with new[] and deleted without ever being read or written,
//     together with the locals that only existed to size them;
//   * a "warm up" loop that merely evaluated the constant 32*512.
TEST_F(PlaygroundTests, newTads_1) {

    const int N = 1;    // number of timed calls

    // Layout as written: rank, 4 dims, 4 strides, then three trailing slots.
    // NOTE(review): trailing slots are presumably {extra/type flags, ews, order}; 99 is the
    // ASCII code of 'c' - confirm against the shape-info format.
    Nd4jLong shapeInfo[] = {4, 1024,1024,1024,1024, 1024*1024*1024,1024*1024,1024,1, 16384,1,99};
    const std::vector<int> tadDims = {0,2};

    // Monotonic clock: immune to wall-clock adjustments during the measurement.
    const auto timeStart = std::chrono::steady_clock::now();
    for (int i = 0; i < N; i++) {
        auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(shapeInfo, tadDims);
        (void) tadPack;     // the result is intentionally unused; only the call is timed
    }
    const auto timeEnd = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // Cast so the argument type matches the format on every platform.
    printf("duration old: %lld\n", static_cast<long long>(duration));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Compares two ways of addressing every element of a rank-3 {10, 10, 10} array whose
// strides are {1, dim0, dim0*dim1} while computing buff[z] = buff[x] * buff[y]:
//   "My time"  - precompute three offset tables with shape::calcOffsets (in parallel
//                sections), then run one flat loop over the tables;
//   "Old time" - call shape::indexOffset three times per element inside the loop.
// Both variants read and write the same buffer; the test prints the average microseconds
// per repetition and asserts nothing.
//
// Fixes relative to the earlier version:
//   * the float buffer is zero-initialised (it used to be read while uninitialised);
//   * buffers are owned by std::vector instead of raw new[]/delete[];
//   * the inner loop index no longer shadows the repetition counter;
//   * printed durations are cast to long long to match "%lld";
//   * the no-op "warm up" loop (it only evaluated 32*512) is gone.
TEST_F(PlaygroundTests, loops_2) {

    const uint N = 5;   // repetitions per variant
    const Nd4jLong dim0(10), dim1(10), dim2(10);

    // rank, dims, strides, then three trailing slots.
    // NOTE(review): trailing slots are presumably {extra/type flags, ews, order}; 102 is the
    // ASCII code of 'f' - confirm against the shape-info format.
    const Nd4jLong shapeInfo[2*3+4] = {3, dim0,dim1,dim2, 1,dim0,dim0*dim1, 8192,1,102};
    const Nd4jLong len = shape::length(shapeInfo);

    // Zero-filled so the multiplications below operate on defined values.
    std::vector<float> buffStorage(len, 0.f);
    float* buff = buffStorage.data();

    const Nd4jLong* shape   = shape::shapeOf(const_cast<Nd4jLong*>(shapeInfo));
    const Nd4jLong* strides = shape::stride(const_cast<Nd4jLong*>(shapeInfo));

    // OmpLaunchHelper threadsInfo(len);

    // Offset tables, one entry per element; filled by calcOffsets inside the timed region.
    std::vector<Nd4jLong> xOffsetsStorage(len), yOffsetsStorage(len), zOffsetsStorage(len);
    Nd4jLong* xOffsets = xOffsetsStorage.data();
    Nd4jLong* yOffsets = yOffsetsStorage.data();
    Nd4jLong* zOffsets = zOffsetsStorage.data();

    //***********************************
    //***********************************
    // steady_clock is monotonic, so the interval cannot be distorted by clock adjustments.
    auto timeStart = std::chrono::steady_clock::now();

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_SECTIONS
        {
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, xOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, yOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, zOffsets);
            }
        }

        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++)
            buff[zOffsets[i]] = buff[xOffsets[i]] * buff[yOffsets[i]];
    }
    auto timeEnd = std::chrono::steady_clock::now();
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    //***********************************
    //***********************************

    // The one-off castShapeInfo setup is deliberately inside the timed region, as before.
    timeStart = std::chrono::steady_clock::now();

    uint xShapeInfoCast[MAX_RANK];
    uint yShapeInfoCast[MAX_RANK];
    uint zShapeInfoCast[MAX_RANK];

    const bool canCastX = DataTypeUtils::castShapeInfo(shapeInfo, xShapeInfoCast);
    const bool canCastY = DataTypeUtils::castShapeInfo(shapeInfo, yShapeInfoCast);
    const bool canCastZ = DataTypeUtils::castShapeInfo(shapeInfo, zShapeInfoCast);

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++) {

            auto xOffset = shape::indexOffset(i, shapeInfo, xShapeInfoCast, len, canCastX);
            auto yOffset = shape::indexOffset(i, shapeInfo, yShapeInfoCast, len, canCastY);
            auto zOffset = shape::indexOffset(i, shapeInfo, zShapeInfoCast, len, canCastZ);
            buff[zOffset] = buff[xOffset] * buff[yOffset];
        }

        // PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
        // for (uint i0 = 0; i0 < shape[0]; ++i0)
        //     for (uint i1 = 0; i1 < shape[1]; ++i1)
        //         for (uint i2 = 0; i2 < shape[2]; ++i2)
        //             buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
    }
    timeEnd = std::chrono::steady_clock::now();
    const auto oldTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // count() has an implementation-defined integer type; cast so it matches %lld.
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
    nd4j_printf("Old time: %lld us;\n", static_cast<long long>(oldTime));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Compares two ways of walking a rank-3 {10, 10, 10} row-major array
// (strides {dim1*dim2, dim2, 1}):
//   "My time"  - an incremental offset walk that keeps per-dimension indices and adjusts
//                running offsets instead of recomputing them from coordinates;
//   "Old time" - precomputed offset tables from shape::calcOffsets plus a flat loop
//                (table allocation is part of the measured time, as before).
// The test prints the average microseconds per repetition and asserts nothing.
//
// Fixes relative to the earlier version:
//   * the float buffer is zero-initialised (it used to be read while uninitialised);
//   * the inner loop index no longer shadows the repetition counter;
//   * printed durations are cast to long long to match "%lld";
//   * the no-op "warm up" loop (it only evaluated 32*512) is gone.
// The offset-walking loop itself is reproduced statement for statement. Heap allocations
// that sit inside a timed region are left as raw new[]/delete[] so that what is measured
// does not change.
TEST_F(PlaygroundTests, loops_3) {

    const uint N = 5;   // repetitions per variant
    // const Nd4jLong dim0(1024), dim1(1024), dim2(1024);
    const Nd4jLong dim0(10), dim1(10), dim2(10);

    // rank, dims, strides, then three trailing slots.
    // NOTE(review): trailing slots are presumably {extra/type flags, ews, order}; 99 is the
    // ASCII code of 'c' - confirm against the shape-info format.
    const Nd4jLong shapeInfo[2*3+4] = {3, dim0,dim1,dim2, dim1*dim2,dim2,1, 8192,1,99};
    const Nd4jLong len = shape::length(shapeInfo);

    // Zero-filled so the multiplications below operate on defined values.
    std::vector<float> buffStorage(len, 0.f);
    float* buff = buffStorage.data();

    const Nd4jLong* shape   = shape::shapeOf(const_cast<Nd4jLong*>(shapeInfo));
    const Nd4jLong* strides = shape::stride(const_cast<Nd4jLong*>(shapeInfo));

    //***********************************
    //***********************************
    // steady_clock is monotonic, so the interval cannot be distorted by clock adjustments.
    auto timeStart = std::chrono::steady_clock::now();

    for (uint iter = 0; iter < N; ++iter)
    {
        // Per-dimension current indices (idx*) and the offset span of each dimension
        // (offsetPerDim*), kept separately for the x, y and z operands.
        Nd4jLong* idxX = new Nd4jLong[3];
        Nd4jLong* idxY = new Nd4jLong[3];
        Nd4jLong* idxZ = new Nd4jLong[3];
        Nd4jLong* offsetPerDimX = new Nd4jLong[3];
        Nd4jLong* offsetPerDimY = new Nd4jLong[3];
        Nd4jLong* offsetPerDimZ = new Nd4jLong[3];
        memset(idxX, 0, sizeof(Nd4jLong) * 3);
        memset(idxY, 0, sizeof(Nd4jLong) * 3);
        memset(idxZ, 0, sizeof(Nd4jLong) * 3);

        // Distance from the first to the last element along each dimension; subtracted
        // from init* when that dimension wraps back to index 0.
        PRAGMA_OMP_SIMD
        for (int k = 0; k < 3; ++k) {
            offsetPerDimX[k] = (shape[k] - 1) * strides[k];
            offsetPerDimY[k] = (shape[k] - 1) * strides[k];
            offsetPerDimZ[k] = (shape[k] - 1) * strides[k];
        }

        Nd4jLong initX(0), initY(0), initZ(0), offsetsX(0), offsetsY(0), offsetsZ(0);
        Nd4jLong rankMinusOne(3 - 1), jX(rankMinusOne), jY(rankMinusOne), jZ(rankMinusOne);

        // we do first iteration separately
        buff[offsetsZ] = buff[offsetsX] * buff[offsetsY];
        uint e = 1;     // number of elements accounted for so far

        // NOTE(review): in the innermost-dimension branch the inner for-loop only advances
        // the offset (and, for x, the element counter e); the multiply at the bottom of the
        // while body runs once per pass, not once per element. This looks like it measures
        // offset traversal rather than a complete element-wise op - confirm intent.
        while (e < len) {

            // printf("%lld, %lld, %lld\n", jX, jY, jZ);
            if(shape[jX] == 1) { --jX; --jY; --jZ; continue; }

            if(jX == rankMinusOne) { for(int l = 1; l < shape[jX]; ++l) {offsetsX += strides[jX]; ++e;} --jX; }
            else if(idxX[jX] < shape[jX] - 1) {initX += strides[jX]; offsetsX = initX; ++idxX[jX]; jX = rankMinusOne; ++e;}
            else {initX -= offsetPerDimX[jX]; idxX[jX--] = 0;}

            if(jY == rankMinusOne) { for(int l = 1; l < shape[jY]; ++l) {offsetsY += strides[jY];} --jY; }
            else if(idxY[jY] < shape[jY] - 1) {initY += strides[jY]; offsetsY = initY; ++idxY[jY]; jY = rankMinusOne; }
            else {initY -= offsetPerDimY[jY]; idxY[jY--] = 0;}

            if(jZ == rankMinusOne) { for(int l = 1; l < shape[jZ]; ++l) {offsetsZ += strides[jZ];} --jZ; }
            else if(idxZ[jZ] < shape[jZ] - 1) {initZ += strides[jZ]; offsetsZ = initZ; ++idxZ[jZ]; jZ = rankMinusOne; }
            else {initZ -= offsetPerDimZ[jZ]; idxZ[jZ--] = 0;}

            buff[offsetsZ] = buff[offsetsX] * buff[offsetsY];
        }

        delete []idxX;
        delete []idxY;
        delete []idxZ;
        delete []offsetPerDimX;
        delete []offsetPerDimY;
        delete []offsetPerDimZ;
    }

    auto timeEnd = std::chrono::steady_clock::now();
    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    //***********************************
    //***********************************

    timeStart = std::chrono::steady_clock::now();

    // uint xShapeInfoCast[MAX_RANK];
    // uint yShapeInfoCast[MAX_RANK];
    // uint zShapeInfoCast[MAX_RANK];

    // bool canCastX = DataTypeUtils::castShapeInfo(shapeInfo, xShapeInfoCast);
    // bool canCastY = DataTypeUtils::castShapeInfo(shapeInfo, yShapeInfoCast);
    // bool canCastZ = DataTypeUtils::castShapeInfo(shapeInfo, zShapeInfoCast);

    // for (int i = 0; i < N; ++i)
    // {
    //     PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
    //     for (uint i0 = 0; i0 < shape[0]; ++i0)
    //         for (uint i1 = 0; i1 < shape[1]; ++i1)
    //             for (uint i2 = 0; i2 < shape[2]; ++i2)
    //                 buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
    // }

    // Offset tables are allocated inside the timed region, exactly as before.
    Nd4jLong *xOffsets, *yOffsets, *zOffsets;
    xOffsets = new Nd4jLong[len];
    yOffsets = new Nd4jLong[len];
    zOffsets = new Nd4jLong[len];

    for (uint iter = 0; iter < N; ++iter)
    {
        PRAGMA_OMP_PARALLEL_SECTIONS
        {
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, xOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, yOffsets);
            }
            PRAGMA_OMP_SECTION
            {
                shape::calcOffsets(3, shape, strides, zOffsets);
            }
        }

        PRAGMA_OMP_PARALLEL_FOR_SIMD
        for (uint i = 0; i < len; i++)
            buff[zOffsets[i]] = buff[xOffsets[i]] * buff[yOffsets[i]];
    }
    timeEnd = std::chrono::steady_clock::now();
    const auto oldTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    delete []xOffsets;
    delete []yOffsets;
    delete []zOffsets;

    // count() has an implementation-defined integer type; cast so it matches %lld.
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
    nd4j_printf("Old time: %lld us;\n", static_cast<long long>(oldTime));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Times NDArray::reduceAlongDimension(reduce::Sum) over dimensions {1, 3} of a
// {10, 10, 10, 10} array filled with 0.1, writing into a {dim0, dim2} result, and prints
// the average time per call in microseconds. Nothing is asserted.
//
// Fixes relative to the earlier version:
//   * the duration was converted to milliseconds but printed with a "us" label; it is now
//     converted to microseconds so the number matches its unit;
//   * the printed value is cast to long long to match "%lld";
//   * the no-op "warm up" loop (it only evaluated 32*512) is gone.
TEST_F(PlaygroundTests, loops_4) {

    const uint N = 2;   // number of timed calls
    // const Nd4jLong dim0(256), dim1(256), dim2(256), dim3(256);
    const Nd4jLong dim0(10), dim1(10), dim2(10), dim3(10);
    NDArray x('c', {dim0, dim1, dim2, dim3});
    NDArray z('c', {dim0, dim2});

    x = 0.1;

    // Monotonic clock: immune to wall-clock adjustments during the measurement.
    const auto timeStart = std::chrono::steady_clock::now();
    for (uint i = 0; i < N; ++i)
        x.reduceAlongDimension(reduce::Sum, &z, {1,3});
    const auto timeEnd = std::chrono::steady_clock::now();

    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Times NDArray::applyTransform(transform::Log) from x into z for two {10, 10, 10} arrays
// whose element-wise-stride field has been overwritten with 0, and prints the average time
// per call in microseconds. Nothing is asserted.
//
// Fixes relative to the earlier version:
//   * the duration was converted to milliseconds but printed with a "us" label; it is now
//     converted to microseconds so the number matches its unit;
//   * the printed value is cast to long long to match "%lld";
//   * the no-op "warm up" loop (it only evaluated 32*512) is gone.
TEST_F(PlaygroundTests, loops_5) {

    const uint N = 2;   // number of timed calls
    // const Nd4jLong dim0(1024), dim1(1024), dim2(256);
    const Nd4jLong dim0(10), dim1(10), dim2(10);
    NDArray x('c', {dim0, dim1, dim2});
    NDArray z('c', {dim0, dim1, dim2});

    // provide worst case
    // NOTE(review): writing 0 into the ews slot presumably forces the transform off its
    // contiguous fast path - confirm against the transform implementation.
    *shape::ews(x.shapeInfo()) = 0;
    *shape::ews(z.shapeInfo()) = 0;

    x = 0.1;

    // Monotonic clock: immune to wall-clock adjustments during the measurement.
    const auto timeStart = std::chrono::steady_clock::now();
    for (uint i = 0; i < N; ++i)
        x.applyTransform(transform::Log, &z);
    const auto timeEnd = std::chrono::steady_clock::now();

    const auto myTime = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();
    nd4j_printf("My time: %lld us;\n", static_cast<long long>(myTime));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Times the nd4j::ops::helpers::im2col helper on a FLOAT32 {bS, iC, iH, iW} image filled by
// linspace(1, 1), writing into a {bS, iC, kH, kW, oH, oW} column array, and prints the
// average microseconds per call. One untimed call is made first as a warm-up. Nothing is
// asserted.
//
// Fixes relative to the earlier version: the interval is measured with the monotonic
// steady_clock, and the printed value is cast to long long so it matches its format
// specifier on every platform.
TEST_F(PlaygroundTests, im2col_1) {

    // int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int bS=2, iH=4,iW=4, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;     // VALID
    int oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;     // VALID

    NDArray image ('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
    NDArray column('c', {bS, iC, kH, kW, oH, oW}, nd4j::DataType::FLOAT32);

    nd4j::LaunchContext * context = image.getContext();
    NDArray padValue (nd4j::DataType::FLOAT32, context);    // scalar =0

    image.linspace(1, 1);

    const int N = 1;    // number of timed calls

    // warm up
    nd4j::ops::helpers::im2col(*context, image, column, kH, kW, sH, sW, pH, pW, dH, dW, padValue);

    // ---------------------------------------- //

    const auto timeStart1 = std::chrono::steady_clock::now();

    for (int i = 0; i < N; i++) {
        nd4j::ops::helpers::im2col(*context, image, column, kH, kW, sH, sW, pH, pW, dH, dW, padValue);
        // FIXME: do not use cuda methods in generic code
        //cudaStreamSynchronize(*context->getCudaStream());
    }

    const auto timeEnd1 = std::chrono::steady_clock::now();
    const auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd1 - timeStart1) / N).count();
    printf("duration my %lld\n", static_cast<long long>(duration1));
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// Times the legacy transform path for im2col: NativeOpExecutioner::execTransformSame with
// nd4j::transform::Im2col, on the same image / column shapes used by im2col_1, and prints
// the average microseconds per call. One untimed call is made first as a warm-up. Nothing
// is asserted.
//
// Fixes relative to the earlier version: the interval is measured with the monotonic
// steady_clock, and the printed value is cast to long long so it matches its format
// specifier on every platform.
TEST_F(PlaygroundTests, im2col_2) {

    // int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int bS=2, iH=4,iW=4, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
    int oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;     // VALID
    int oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;     // VALID

    NDArray image ('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32);
    NDArray column('c', {bS, iC, kH, kW, oH, oW}, nd4j::DataType::FLOAT32);

    nd4j::LaunchContext * context = image.getContext();

    image.linspace(1, 1);

    // Kernel / stride / padding / dilation packed as doubles, followed by two zeros.
    // NOTE(review): the two trailing 0. values are presumably the same-mode flag and the
    // pad value - confirm against the Im2col transform.
    ExtraArguments extras(std::vector<double>({(double)kH, (double)kW, (double)sH, (double)sW, (double)pH, (double)pW, (double)dH, (double)dW, 0., 0.}));

    const int N = 1;    // number of timed calls

    // NOTE(review): the returned pointer is never freed here; presumably `extras` keeps
    // ownership of it - confirm against ExtraArguments.
    void* params = extras.argumentsAsT(column.dataType());

    // warm up
    NativeOpExecutioner::execTransformSame(context, nd4j::transform::Im2col,
                                           image.buffer(), image.getShapeInfo(), image.getSpecialBuffer(), image.getSpecialShapeInfo(),
                                           column.buffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(),
                                           params,
                                           nullptr, nullptr);

    // ---------------------------------------- //
    const auto timeStart2 = std::chrono::steady_clock::now();

    for (int i = 0; i < N; i++) {
        NativeOpExecutioner::execTransformSame(context, nd4j::transform::Im2col,
                                               image.buffer(), image.getShapeInfo(), image.getSpecialBuffer(), image.getSpecialShapeInfo(),
                                               column.buffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(),
                                               params,
                                               nullptr, nullptr);
    }

    const auto timeEnd2 = std::chrono::steady_clock::now();
    const auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd2 - timeStart2) / N).count();
    printf("duration old %lld\n", static_cast<long long>(duration2));
}
|
|
/*
|
|
TEST_F(PlaygroundTests, test_scatter_119) {
|
|
auto output = NDArrayFactory::create<float>('c', {65536, 512});
|
|
auto updates = NDArrayFactory::create<float>('c', {65536, 512});
|
|
auto indices = NDArrayFactory::create<int>('c', {65536});
|
|
|
|
int p = 0;
|
|
for (int e = 65534; e >= 0; e--)
|
|
indices.p(p++, e);
|
|
|
|
indices.syncToDevice();
|
|
|
|
int N = 1;
|
|
|
|
auto timeStart1 = std::chrono::system_clock::now();
|
|
|
|
for (int i = 0; i < N ; i++) {
|
|
helpers::scatter(LaunchContext::defaultContext(), pairwise::CopyPws, indices, updates, output, false);
|
|
// FIXME: do not use cuda methods in generic code
|
|
//cudaStreamSynchronize(*context->getCudaStream());
|
|
}
|
|
|
|
auto timeEnd1 = std::chrono::system_clock::now();
|
|
auto duration1 = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd1 - timeStart1) / N).count();
|
|
nd4j_printf("duration my %ld\n", duration1);
|
|
}
|
|
|
|
TEST_F(PlaygroundTests, test_scatter_120) {
|
|
auto output = NDArrayFactory::create_<float>('c', {65536, 512});
|
|
auto updates = NDArrayFactory::create_<float>('c', {65536, 512});
|
|
auto indices = NDArrayFactory::create_<int>('c', {65536});
|
|
|
|
int p = 0;
|
|
for (int e = 65534; e >= 0; e--)
|
|
indices->p(p++, e);
|
|
|
|
indices->syncToDevice();
|
|
|
|
int N = 1;
|
|
|
|
auto timeStart1 = std::chrono::system_clock::now();
|
|
|
|
for (int i = 0; i < N ; i++) {
|
|
helpers::scatter(LaunchContext::defaultContext(), pairwise::CopyPws, *indices, *updates, *output, false);
|
|
// FIXME: do not use cuda methods in generic code
|
|
//cudaStreamSynchronize(*context->getCudaStream());
|
|
}
|
|
|
|
auto timeEnd1 = std::chrono::system_clock::now();
|
|
auto duration1 = std::chrono::duration_cast<std::chrono::milliseconds> ((timeEnd1 - timeStart1) / N).count();
|
|
nd4j_printf("duration my %ld\n", duration1);
|
|
|
|
delete output;
|
|
delete indices;
|
|
delete updates;
|
|
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
TEST_F(PlaygroundTests, mmulMxM_1) {
|
|
|
|
const int numOfIters = 100;
|
|
|
|
const Nd4jLong M = 1024;
|
|
const Nd4jLong K = 1024;
|
|
const Nd4jLong N = 1024;
|
|
|
|
NDArray a('f', {M,K}, nd4j::DataType::FLOAT32);
|
|
NDArray b('f', {K,N}, nd4j::DataType::FLOAT32);
|
|
NDArray c('c', {M,N}, nd4j::DataType::FLOAT32);
|
|
|
|
|
|
auto timeStart = std::chrono::system_clock::now();
|
|
|
|
for (int i = 0; i < numOfIters; ++i)
|
|
nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.);
|
|
|
|
auto timeEnd = std::chrono::system_clock::now();
|
|
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / numOfIters).count();
|
|
printf("duration %ld\n", duration1);
|
|
}
|
|
*/
|