1922 lines
73 KiB
C++
1922 lines
73 KiB
C++
/*******************************************************************************
|
|
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
*
|
|
* This program and the accompanying materials are made available under the
|
|
* terms of the Apache License, Version 2.0 which is available at
|
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
* License for the specific language governing permissions and limitations
|
|
* under the License.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
******************************************************************************/
|
|
|
|
//
|
|
// @author raver119@gmail.com
|
|
//
|
|
|
|
#include <ops/declarable/CustomOperations.h>
|
|
#include <performance/benchmarking/FullBenchmarkSuit.h>
|
|
#include <ops/declarable/LegacyRandomOp.h>
|
|
#include <algorithm>
|
|
|
|
#ifdef RELEASE_BUILD
|
|
int wIterations = 4;
|
|
int rIterations = 20;
|
|
int gemmRegularUpperPow = 11;
|
|
int scalarBenchmarkPowLimit = 26;
|
|
int transformBenchmarkPowLimit = 26;
|
|
int intermediateTransformPowLimit = 22;
|
|
int intermediateTransformPowLimit2 = 18;
|
|
int pairwisePowLimit = 26;
|
|
int heavyPowLimit = 22;
|
|
int nonEwsPowLimit = 10;
|
|
int reduceScalarPowLimit = 26;
|
|
int stridedReductionPowLimit = 20;
|
|
int mismatchedAssignPowLimit = 26;
|
|
int gatherOpPowLimit = 18;
|
|
int gatherOpPowLimit2 = 16;
|
|
int gatherOpPowLimit3 = 12;
|
|
int broadcastMatrixRankLimit = 5;
|
|
int limit30 = 30;
|
|
int limit26 = 26;
|
|
int limit24 = 24;
|
|
int limit22 = 22;
|
|
int limit20 = 20;
|
|
int limit18 = 18;
|
|
int limit10 = 10;
|
|
int limit5 = 5;
|
|
int limit3 = 3;
|
|
#else
|
|
int wIterations = 0;
|
|
int rIterations = 1;
|
|
int gemmRegularUpperPow = 7;
|
|
int scalarBenchmarkPowLimit = 10;
|
|
int transformBenchmarkPowLimit = 10;
|
|
int intermediateTransformPowLimit = 10;
|
|
int intermediateTransformPowLimit2 = 10;
|
|
int pairwisePowLimit = 10;
|
|
int heavyPowLimit = 10;
|
|
int nonEwsPowLimit = 6;
|
|
int reduceScalarPowLimit = 10;
|
|
int stridedReductionPowLimit = 12;
|
|
int mismatchedAssignPowLimit = 2;
|
|
int gatherOpPowLimit = 10;
|
|
int gatherOpPowLimit2 = 8;
|
|
int gatherOpPowLimit3 = 8;
|
|
int broadcastMatrixRankLimit = 3;
|
|
int limit26 = 8;
|
|
int limit24 = 8;
|
|
int limit22 = 8;
|
|
int limit20 = 8;
|
|
int limit18 = 8;
|
|
int limit10 = 4;
|
|
int limit5 = 3;
|
|
int limit3 = 1;
|
|
#endif
|
|
|
|
namespace nd4j {
|
|
|
|
static std::string layerNormBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
BoolParameters nhwc("nhwc"); //0 = nchw
|
|
|
|
#ifdef _RELEASE
|
|
int c = 32;
|
|
int hw = 64;
|
|
#else
|
|
int c = 3;
|
|
int hw = 8;
|
|
#endif
|
|
|
|
ParametersBatch batch({&nhwc});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int n = p.getIntParam("nhwc");
|
|
|
|
int axis;
|
|
if (n == 0) {
|
|
//nchw
|
|
auto input = NDArrayFactory::create_<float>('c', {16, c, hw, hw});
|
|
auto output = NDArrayFactory::create_<float>('c', {16, c, hw, hw});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
axis = 1;
|
|
} else {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, c});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, c});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
axis = 3;
|
|
}
|
|
|
|
auto bias = NDArrayFactory::create_<float>('c', {c});
|
|
ctx->setInputArray(1, bias, true);
|
|
auto iargs = new Nd4jLong[1];
|
|
iargs[0] = axis;
|
|
ctx->setIArguments(iargs, 1);
|
|
delete[] iargs;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::layer_norm layerNorm;
|
|
DeclarableBenchmark benchmark(layerNorm, "layer norm");
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "Layer Norm");
|
|
|
|
return output;
|
|
}
|
|
|
|
|
|
static std::string maxPool3DBenchmark(){
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
BoolParameters ncdhw("ncdhw"); //1 = ndhwc
|
|
ParametersBatch batch({&ncdhw});
|
|
|
|
nd4j::ops::maxpool3dnew maxpool3Dnew;
|
|
DeclarableBenchmark benchmark(maxpool3Dnew, "maxPool3d");
|
|
|
|
#ifdef _RELEASE
|
|
int mb = 16;
|
|
int chIn = 16;
|
|
int chOut = 16;
|
|
int dhw = 64;
|
|
#else
|
|
int mb = 1;
|
|
int chIn = 3;
|
|
int chOut = 3;
|
|
int dhw = 16;
|
|
#endif
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int format = p.getIntParam("ncdhw");
|
|
|
|
//Set inputs and outputs
|
|
//Same mode + stride 1: output is same shape as input
|
|
if(format == 1) {
|
|
//NDHWC
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
|
|
} else {
|
|
//NCDHW
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
|
|
}
|
|
|
|
auto iargs = new Nd4jLong[15];
|
|
//Kernel, strides, padding, dilation - x3 each
|
|
iargs[0] = 3; //Kernel
|
|
iargs[1] = 3;
|
|
iargs[2] = 3;
|
|
iargs[3] = 1; //Stride
|
|
iargs[4] = 1;
|
|
iargs[5] = 1;
|
|
iargs[6] = 0; //Padding
|
|
iargs[7] = 0;
|
|
iargs[8] = 0;
|
|
iargs[9] = 1; //Dilation
|
|
iargs[10] = 1;
|
|
iargs[11] = 1;
|
|
iargs[12] = 1; //Same mode
|
|
iargs[13] = 0; //Unused for max
|
|
iargs[14] = format; //0 = ncdhw
|
|
ctx->setIArguments(iargs, 14);
|
|
delete[] iargs;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "maxPool3d");
|
|
return output;
|
|
}
|
|
|
|
|
|
static std::string conv3dBenchmark(){
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
BoolParameters ncdhw("ncdhw"); //1 = ndhwc
|
|
ParametersBatch batch({&ncdhw});
|
|
|
|
nd4j::ops::conv3dnew conv3Dnew;
|
|
DeclarableBenchmark benchmark(conv3Dnew, "conv3d");
|
|
|
|
#ifdef _RELEASE
|
|
int mb = 16;
|
|
int chIn = 16;
|
|
int chOut = 16;
|
|
int dhw = 64;
|
|
#else
|
|
int mb = 1;
|
|
int chIn = 3;
|
|
int chOut = 3;
|
|
int dhw = 16;
|
|
#endif
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int format = p.getIntParam("ncdhw");
|
|
|
|
//Set inputs and outputs
|
|
//Same mode + stride 1: output is same shape as input
|
|
if(format == 1) {
|
|
//NDHWC
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
|
|
} else {
|
|
//NCDHW
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
|
|
}
|
|
|
|
//Weights and bias:
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {3, 3, 3, chIn, chOut}), true);
|
|
ctx->setInputArray(2, NDArrayFactory::create_<float>('c', {chOut}), true);
|
|
|
|
|
|
auto iargs = new Nd4jLong[14];
|
|
//Kernel, strides, padding, dilation - x3 each
|
|
iargs[0] = 3; //Kernel
|
|
iargs[1] = 3;
|
|
iargs[2] = 3;
|
|
iargs[3] = 1; //Stride
|
|
iargs[4] = 1;
|
|
iargs[5] = 1;
|
|
iargs[6] = 0; //Padding
|
|
iargs[7] = 0;
|
|
iargs[8] = 0;
|
|
iargs[9] = 1; //Dilation
|
|
iargs[10] = 1;
|
|
iargs[11] = 1;
|
|
iargs[12] = 1; //Same mode
|
|
iargs[13] = format; //0 = ncdhw
|
|
ctx->setIArguments(iargs, 14);
|
|
delete[] iargs;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "CNN3D");
|
|
return output;
|
|
}
|
|
|
|
|
|
static std::string lstmBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]
|
|
#ifdef _RELEASE
|
|
PredefinedParameters mb("mb", {1, 8, 64});
|
|
PredefinedParameters nInOut("nInOut", {32, 256, 1024});
|
|
#else
|
|
PredefinedParameters mb("mb", {1});
|
|
PredefinedParameters nInOut("nInOut", {32});
|
|
#endif
|
|
|
|
ParametersBatch batch({&format, &mb, &nInOut});
|
|
nd4j::ops::lstmBlock lstmBlock;
|
|
DeclarableBenchmark benchmark(lstmBlock, "lstm");
|
|
|
|
int seqLength = 32;
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int f = p.getIntParam("format");
|
|
int m = p.getIntParam("mb");
|
|
int n = p.getIntParam("nInOut");
|
|
|
|
Nd4jLong l = 0;
|
|
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>(l), true); //Max TS length (unused)
|
|
|
|
|
|
if (f == 0) {
|
|
//TNS format
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //x
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //i
|
|
ctx->setOutputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //c
|
|
ctx->setOutputArray(2, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //f
|
|
ctx->setOutputArray(3, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //o
|
|
ctx->setOutputArray(4, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //z
|
|
ctx->setOutputArray(5, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //h
|
|
ctx->setOutputArray(6, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //y
|
|
} else {
|
|
//NST format
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //x
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //i
|
|
ctx->setOutputArray(1, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //c
|
|
ctx->setOutputArray(2, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //f
|
|
ctx->setOutputArray(3, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //o
|
|
ctx->setOutputArray(4, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //z
|
|
ctx->setOutputArray(5, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //h
|
|
ctx->setOutputArray(6, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //y
|
|
}
|
|
|
|
auto cLast = NDArrayFactory::create_<float>('c', {m, n});
|
|
auto yLast = NDArrayFactory::create_<float>('c', {m, n});
|
|
auto W = NDArrayFactory::create_<float>('c', {2 * n, 4 * n});
|
|
auto Wci = NDArrayFactory::create_<float>('c', {n});
|
|
auto Wcf = NDArrayFactory::create_<float>('c', {n});
|
|
auto Wco = NDArrayFactory::create_<float>('c', {n});
|
|
auto b = NDArrayFactory::create_<float>('c', {4 * n});
|
|
|
|
ctx->setInputArray(2, cLast, true);
|
|
ctx->setInputArray(3, yLast, true);
|
|
ctx->setInputArray(4, W, true);
|
|
ctx->setInputArray(5, Wci, true);
|
|
ctx->setInputArray(6, Wcf, true);
|
|
ctx->setInputArray(7, Wco, true);
|
|
ctx->setInputArray(8, b, true);
|
|
|
|
auto iargs = new Nd4jLong[2];
|
|
iargs[0] = 0; //No peephole
|
|
iargs[1] = f;
|
|
ctx->setIArguments(iargs, 2);
|
|
delete[] iargs;
|
|
|
|
auto targs = new double[2];
|
|
targs[0] = 1.0; //forget bias
|
|
targs[1] = 0.0; //cell clipping value
|
|
ctx->setTArguments(targs, 2);
|
|
delete[] targs;
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
|
|
return output;
|
|
}
|
|
|
|
static std::string batchnormBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Convolution2D op
|
|
BoolParameters nhwc("nhwc");
|
|
#ifdef _RELEASE
|
|
PredefinedParameters c("c", {3, 32, 128});
|
|
PredefinedParameters hw("hw", {32, 128});
|
|
#else
|
|
PredefinedParameters c("c", {3});
|
|
PredefinedParameters hw("hw", {16});
|
|
#endif
|
|
|
|
ParametersBatch batch({&nhwc, &c, &hw});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int n = p.getIntParam("nhwc");
|
|
int hw = p.getIntParam("hw");
|
|
int ch = p.getIntParam("c");
|
|
|
|
auto args = new Nd4jLong[3];
|
|
args[0] = args[1] = 1; //apply scale and offset
|
|
if (n == 0) {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, ch, hw, hw});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, ch, hw, hw});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
args[2] = 1; //axis
|
|
} else {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, ch});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, ch});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
args[2] = 3; //axis
|
|
}
|
|
ctx->setIArguments(args, 3);
|
|
delete[] args;
|
|
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {ch}), true); //mean
|
|
auto v = NDArrayFactory::create_<float>('c', {ch});
|
|
v->assign(1.0f);
|
|
ctx->setInputArray(2, v, true); //variance
|
|
auto g = NDArrayFactory::create_<float>('c', {ch});
|
|
g->assign(1.0);
|
|
ctx->setInputArray(3, g, true); //gamma
|
|
auto b = NDArrayFactory::create_<float>('c', {ch});
|
|
b->assign(1.0);
|
|
ctx->setInputArray(4, b, true); //beta
|
|
|
|
auto targs = new double[1];
|
|
targs[0] = 1e-5;
|
|
ctx->setTArguments(targs, 1);
|
|
delete[] targs;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::batchnorm batchnorm;
|
|
DeclarableBenchmark benchmark(batchnorm, "batchnorm");
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "Batch Normalization");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string pool2dBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Convolution2D op
|
|
BoolParameters nhwc("nhwc");
|
|
#ifdef _RELEASE
|
|
PredefinedParameters k("k", {2, 3, 5});
|
|
PredefinedParameters c("c", {3, 32, 128});
|
|
PredefinedParameters hw("hw", {32, 128});
|
|
#else
|
|
PredefinedParameters k("k", {2});
|
|
PredefinedParameters c("c", {3});
|
|
PredefinedParameters hw("hw", {8});
|
|
#endif
|
|
|
|
ParametersBatch batch({&nhwc, &k, &c, &hw});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int n = p.getIntParam("nhwc");
|
|
int hw = p.getIntParam("hw");
|
|
int khw = p.getIntParam("k");
|
|
|
|
if (n == 0) {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
} else {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
}
|
|
|
|
auto args = new Nd4jLong[11];
|
|
args[0] = args[1] = khw; //Kernel
|
|
args[2] = args[3] = 1;//Stride
|
|
args[4] = args[5] = 0; //Pad
|
|
args[6] = args[7] = 1; //Dilation
|
|
args[8] = 1; //SAME
|
|
args[9] = 0; //Divisor mode - 0 = exclude padding in divisor
|
|
args[10] = n;//0-nchw, 1=nhwc
|
|
ctx->setIArguments(args, 11);
|
|
delete[] args;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::avgpool2d avgpool2d;
|
|
DeclarableBenchmark benchmark1(avgpool2d, "avgpool");
|
|
output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pooling 2d Operation");
|
|
|
|
nd4j::ops::maxpool2d maxpool2d;
|
|
DeclarableBenchmark benchmark2(maxpool2d, "maxpool");
|
|
output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pooling 2d Operation");
|
|
return output;
|
|
}
|
|
|
|
static std::string conv2dBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Convolution2D op
|
|
BoolParameters nhwc("nhwc");
|
|
#ifdef _RELEASE
|
|
PredefinedParameters k("k", {2, 3, 5});
|
|
PredefinedParameters c("c", {3, 32, 128});
|
|
PredefinedParameters hw("hw", {32, 128});
|
|
#else
|
|
PredefinedParameters k("k", {2});
|
|
PredefinedParameters c("c", {3});
|
|
PredefinedParameters hw("hw", {8});
|
|
#endif
|
|
ParametersBatch batch({&nhwc, &k, &c, &hw});
|
|
nd4j::ops::conv2d conv2d;
|
|
DeclarableBenchmark benchmark(conv2d, "conv2d");
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int n = p.getIntParam("nhwc");
|
|
int hw = p.getIntParam("hw");
|
|
int khw = p.getIntParam("k");
|
|
|
|
if (n == 0) {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
} else {
|
|
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
|
|
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
|
|
ctx->setInputArray(0, input, true);
|
|
ctx->setOutputArray(0, output, true);
|
|
}
|
|
|
|
auto b = NDArrayFactory::create_<float>('c', {p.getIntParam("c")});
|
|
auto w = NDArrayFactory::create_<float>('c', {khw, khw, p.getIntParam("c"), p.getIntParam("c")}); // [kH, kW, iC, oC] always
|
|
|
|
ctx->setInputArray(1, w, true);
|
|
ctx->setInputArray(2, b, true);
|
|
|
|
auto args = new Nd4jLong[10];
|
|
args[0] = args[1] = khw; //Kernel
|
|
args[2] = args[3] = 1;//Stride
|
|
args[4] = args[5] = 0; //Pad
|
|
args[6] = args[7] = 1; //Dilation
|
|
args[8] = 1; //SAME
|
|
args[9] = n;//0-nchw, 1=nhwc
|
|
ctx->setIArguments(args, 10);
|
|
delete[] args;
|
|
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d Operation");
|
|
return output;
|
|
}
|
|
|
|
static std::string rngBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
//Uniform, gaussian and bernoulli RNG generation
|
|
|
|
IntPowerParameters length("length", 2, 4, scalarBenchmarkPowLimit, 3); //2^8 to 2^30 in steps of 3
|
|
|
|
ParametersBatch batch({&length});
|
|
|
|
auto gen01 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
|
|
auto d = new double[2];
|
|
d[0] = 0.0;
|
|
d[1] = 1.0;
|
|
ctx->setTArguments(d, 2);
|
|
delete[] d;
|
|
return ctx;
|
|
};
|
|
|
|
auto gen05 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
|
|
auto d = new double[1];
|
|
d[0] = 0.5;
|
|
ctx->setTArguments(d, 1);
|
|
delete[] d;
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::LegacyRandomOp unif(random::UniformDistribution);
|
|
DeclarableBenchmark dbU(unif, "uniform");
|
|
output += helper.runOperationSuit(&dbU, gen01, batch, "Uniform Distribution");
|
|
|
|
nd4j::ops::LegacyRandomOp gaussian(random::GaussianDistribution);
|
|
DeclarableBenchmark dbG(gaussian, "gaussian");
|
|
output += helper.runOperationSuit(&dbG, gen01, batch, "Gaussian Distribution");
|
|
|
|
nd4j::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution);
|
|
DeclarableBenchmark dbTU(unif, "trunc.norm");
|
|
output += helper.runOperationSuit(&dbTU, gen01, batch, "Truncated Normal Distribution");
|
|
|
|
nd4j::ops::LegacyRandomOp ln(random::LogNormalDistribution);
|
|
DeclarableBenchmark dbLN(ln, "uniform");
|
|
output += helper.runOperationSuit(&dbLN, gen01, batch, "Log Normal Distribution");
|
|
|
|
nd4j::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution);
|
|
DeclarableBenchmark dbB(bernoulli, "bernoulli");
|
|
output += helper.runOperationSuit(&dbB, gen05, batch, "Bernoulli Distribution");
|
|
|
|
nd4j::ops::LegacyRandomOp dropout(random::BernoulliDistribution);
|
|
DeclarableBenchmark dbD(dropout, "dropout");
|
|
output += helper.runOperationSuit(&dbD, gen05, batch, "Dropout");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string gemmIrregularBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Basically the same as above, but with irregular shapes (not multiples of 8, etc)
|
|
|
|
#ifdef _RELEASE
|
|
int tAMax = 1;
|
|
int tBMax = 1;
|
|
int b = 1024;
|
|
int c = 1024;
|
|
#else
|
|
int tAMax = 1;
|
|
int tBMax = 1;
|
|
int b = 32;
|
|
int c = 32;
|
|
#endif
|
|
|
|
for (int tA = 0; tA <= tAMax; tA++) {
|
|
for (int tB = 0; tB <= tBMax; tB++) {
|
|
IntParameters d("d", 1020, 1028, 1); //1020, 1021, ..., 1028
|
|
ParametersBatch dim({&d});
|
|
|
|
//Vary A.rows:
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto a = p.getIntParam("d");
|
|
std::vector<Nd4jLong> shapeA;
|
|
std::vector<Nd4jLong> shapeB;
|
|
if (tA) {
|
|
shapeA = {b, a};
|
|
} else {
|
|
shapeA = {a, b};
|
|
}
|
|
if (tB) {
|
|
shapeB = {c, b};
|
|
} else {
|
|
shapeB = {b, c};
|
|
}
|
|
auto A = NDArrayFactory::create_<float>('c', shapeA);
|
|
auto B = NDArrayFactory::create_<float>('c', shapeB);
|
|
auto C = NDArrayFactory::create_<float>('f', {a, c});
|
|
|
|
x.push_back(A);
|
|
y.push_back(B);
|
|
z.push_back(C);
|
|
};
|
|
|
|
std::string n;
|
|
n += "Gemm (a.rows) - tA=";
|
|
n += std::to_string(tA);
|
|
n += ", tB=";
|
|
n += std::to_string(tB);
|
|
|
|
MatrixBenchmark mb(1.0, 0.0, tA, tB, n);
|
|
|
|
output += helper.runOperationSuit(&mb, generator, dim, n.c_str());
|
|
|
|
//Vary A.columns / B.rows
|
|
auto generator2 = PARAMETRIC_XYZ() {
|
|
auto a = 1024;
|
|
auto b = p.getIntParam("d");
|
|
auto c = 1024;
|
|
std::vector<Nd4jLong> shapeA;
|
|
std::vector<Nd4jLong> shapeB;
|
|
if (tA) {
|
|
shapeA = {b, a};
|
|
} else {
|
|
shapeA = {a, b};
|
|
}
|
|
if (tB) {
|
|
shapeB = {c, b};
|
|
} else {
|
|
shapeB = {b, c};
|
|
}
|
|
auto A = NDArrayFactory::create_<float>('c', shapeA);
|
|
auto B = NDArrayFactory::create_<float>('c', shapeB);
|
|
auto C = NDArrayFactory::create_<float>('f', {a, c});
|
|
|
|
x.push_back(A);
|
|
y.push_back(B);
|
|
z.push_back(C);
|
|
};
|
|
|
|
std::string n2;
|
|
n2 += "Gemm (a.columns) - tA=";
|
|
n2 += std::to_string(tA);
|
|
n2 += ", tB=";
|
|
n2 += std::to_string(tB);
|
|
|
|
MatrixBenchmark mb2(1.0, 0.0, tA, tB, n2);
|
|
|
|
output += helper.runOperationSuit(&mb2, generator2, dim, n2.c_str());
|
|
|
|
//Vary A.columns / B.rows
|
|
auto generator3 = PARAMETRIC_XYZ() {
|
|
auto a = 1024;
|
|
auto b = 1024;
|
|
auto c = p.getIntParam("d");
|
|
std::vector<Nd4jLong> shapeA;
|
|
std::vector<Nd4jLong> shapeB;
|
|
if (tA) {
|
|
shapeA = {b, a};
|
|
} else {
|
|
shapeA = {a, b};
|
|
}
|
|
if (tB) {
|
|
shapeB = {c, b};
|
|
} else {
|
|
shapeB = {b, c};
|
|
}
|
|
auto A = NDArrayFactory::create_<float>('c', shapeA);
|
|
auto B = NDArrayFactory::create_<float>('c', shapeB);
|
|
auto C = NDArrayFactory::create_<float>('f', {a, c});
|
|
|
|
x.push_back(A);
|
|
y.push_back(B);
|
|
z.push_back(C);
|
|
};
|
|
|
|
std::string n3;
|
|
n3 += "Gemm (b.columns) - tA=";
|
|
n3 += std::to_string(tA);
|
|
n3 += ", tB=";
|
|
n3 += std::to_string(tB);
|
|
|
|
MatrixBenchmark mb3(1.0, 0.0, tA, tB, n);
|
|
|
|
output += helper.runOperationSuit(&mb3, generator3, dim, n3.c_str());
|
|
}
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string batchGemmBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Rank 3 - [32,1024,1024]x[32,1024,1024]
|
|
//Rank 4 - [4,8,1024,1024]x[4,8,1024,1024]
|
|
|
|
IntParameters rank("rank", 3, 4, 1);
|
|
|
|
ParametersBatch b({&rank});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto rank = p.getIntParam("rank");
|
|
std::vector<Nd4jLong> shapeA;
|
|
std::vector<Nd4jLong> shapeB;
|
|
auto ctx = new Context(1);
|
|
|
|
if(rank == 3){
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
|
|
} else {
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
|
|
}
|
|
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::matmul mmul;
|
|
DeclarableBenchmark benchmark(mmul, "mmul (batch)");
|
|
output += helper.runOperationSuit(&benchmark, generator, b, "MMul (batch)");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string gemmRegularBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
for (int o = 0; o <= 1; o++) {
|
|
char resultOrder = (o == 0 ? 'f' : 'c');
|
|
for (int tA = 0; tA <= 1; tA++) {
|
|
for (int tB = 0; tB <= 1; tB++) {
|
|
|
|
|
|
IntPowerParameters pa("sz", 2, 7, gemmRegularUpperPow, 2); //2^7=128, 2^9=512, 2^11=2048
|
|
|
|
ParametersBatch b({&pa});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto s = p.getIntParam("sz");
|
|
auto A = NDArrayFactory::create_<float>('c', {s, s});
|
|
auto B = NDArrayFactory::create_<float>('c', {s, s});
|
|
auto C = NDArrayFactory::create_<float>(resultOrder, {s, s});
|
|
|
|
x.push_back(A);
|
|
y.push_back(B);
|
|
z.push_back(C);
|
|
};
|
|
|
|
std::string n;
|
|
n += "Gemm - tA=";
|
|
n += std::to_string(tA);
|
|
n += ", tB=";
|
|
n += std::to_string(tB);
|
|
n += ", cOrder=";
|
|
n += resultOrder;
|
|
|
|
MatrixBenchmark mb(1.0, 0.0, tA == 0 ? false : true, tB == 0 ? false : true, n);
|
|
|
|
output += helper.runOperationSuit(&mb, generator, b, n.c_str());
|
|
}
|
|
}
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string scatterOpBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^26 in steps of 4
|
|
ParametersBatch batch({&length});
|
|
|
|
//Gather 1D tests - 1d ref, 1d indices, 1d updates -> 1d output
|
|
nd4j::ops::scatter_upd scatter_update1;
|
|
DeclarableBenchmark sa1d(scatter_update1, "scatter_update1d");
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int length = p.getIntParam("length");
|
|
auto in = NDArrayFactory::create_<float>('c', {length});
|
|
auto indices = NDArrayFactory::create_<int>('c', {length});
|
|
auto updates = NDArrayFactory::create_<float>('c', {length});
|
|
|
|
int* a = new int[length];
|
|
for( int i=0; i<length; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + length-1));
|
|
for( int i=0; i<length; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setInputArray(2, updates, true);
|
|
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
|
|
ctx->markInplace(true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&sa1d, generator, batch, "Scatter Update - 1d");
|
|
|
|
//Gather 2D tests - 2d input, 1d indices, 2d updates -> 2d output
|
|
IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^16 in steps of 2: 2^10, ..., 2^20
|
|
PredefinedParameters cols("cols", {32});
|
|
ParametersBatch batch2({&rows, &cols});
|
|
nd4j::ops::scatter_upd scatter_update2;
|
|
DeclarableBenchmark sa2d(scatter_update2, "scatter_update2d");
|
|
auto generator2 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int rows = p.getIntParam("rows");
|
|
int cols = p.getIntParam("cols");
|
|
auto in = NDArrayFactory::create_<float>('c', {rows, cols});
|
|
auto indices = NDArrayFactory::create_<int>('c', {rows});
|
|
auto updates = NDArrayFactory::create_<float>('c', {rows, cols});
|
|
|
|
int* a = new int[rows];
|
|
for( int i=0; i<rows; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + rows-1));
|
|
for( int i=0; i<rows; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setInputArray(2, updates, true);
|
|
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
|
|
ctx->markInplace(true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&sa2d, generator2, batch2, "Scatter Update - 2d");
|
|
|
|
//Gather 3D tests - 3d input, 1d indices -> 3d output
|
|
IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4);
|
|
PredefinedParameters sz1("sz1", {32});
|
|
ParametersBatch batch3({&sz0, &sz1});
|
|
nd4j::ops::scatter_upd scatter_update3;
|
|
DeclarableBenchmark sa3d(scatter_update3, "scatter3d");
|
|
auto generator3 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int sz0 = p.getIntParam("sz0");
|
|
int sz1 = p.getIntParam("sz1");
|
|
auto in = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
|
|
auto indices = NDArrayFactory::create_<int>('c', {sz0});
|
|
auto updates = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
|
|
|
|
int* a = new int[sz0];
|
|
for( int i=0; i<sz0; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + sz0-1));
|
|
for( int i=0; i<sz0; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setInputArray(2, updates, true);
|
|
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
|
|
ctx->markInplace(true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&sa3d, generator3, batch3, "Scatter Update - 3d");
|
|
return output;
|
|
}
|
|
|
|
static std::string gatherOpBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^22 in steps of 4
|
|
ParametersBatch batch({&length});
|
|
|
|
//Gather 1D tests - 1d input, 1d indices -> 1d output
|
|
nd4j::ops::gather gather1;
|
|
DeclarableBenchmark gather1d(gather1, "gather1d");
|
|
auto generator = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int length = p.getIntParam("length");
|
|
auto in = NDArrayFactory::create_<float>('c', {length});
|
|
auto indices = NDArrayFactory::create_<int>('c', {length});
|
|
int* a = new int[length];
|
|
for( int i=0; i<length; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + length-1));
|
|
for( int i=0; i<length; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {length}), true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&gather1d, generator, batch, "Gather - 1d");
|
|
|
|
//Gather 2D tests - 2d input, 1d indices -> 2d output
|
|
IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^20 in steps of 2: 2^10, ..., 2^20
|
|
PredefinedParameters cols("cols", {32});
|
|
ParametersBatch batch2({&rows, &cols});
|
|
nd4j::ops::gather gather2;
|
|
DeclarableBenchmark gather2d(gather2, "gather2d");
|
|
auto generator2 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int rows = p.getIntParam("rows");
|
|
int cols = p.getIntParam("cols");
|
|
auto in = NDArrayFactory::create_<float>('c', {rows, cols});
|
|
auto indices = NDArrayFactory::create_<int>('c', {rows});
|
|
|
|
int* a = new int[rows];
|
|
for( int i=0; i<rows; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + rows-1));
|
|
for( int i=0; i<rows; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {rows, cols}), true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&gather2d, generator2, batch2, "Gather - 2d");
|
|
|
|
//Gather 3D tests - 3d input, 1d indices -> 3d output
|
|
IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); //2^8 to 2^16 in steps of 4
|
|
PredefinedParameters sz1("sz1", {32});
|
|
ParametersBatch batch3({&sz0, &sz1});
|
|
nd4j::ops::gather gather3;
|
|
DeclarableBenchmark gather3d(gather3, "gather3d");
|
|
auto generator3 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
int sz0 = p.getIntParam("sz0");
|
|
int sz1 = p.getIntParam("sz1");
|
|
auto in = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
|
|
auto indices = NDArrayFactory::create_<int>('c', {sz0});
|
|
|
|
int* a = new int[sz0];
|
|
for( int i=0; i<sz0; i++ ){
|
|
a[i] = i;
|
|
}
|
|
srand(12345);
|
|
std::random_shuffle(a, (a + sz0-1));
|
|
for( int i=0; i<sz0; i++ ){
|
|
indices->p(i, a[i]);
|
|
}
|
|
delete[] a;
|
|
|
|
ctx->setInputArray(0, in, true);
|
|
ctx->setInputArray(1, indices, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1}), true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&gather3d, generator3, batch3, "Gather - 3d");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string mismatchedOrdersAssignBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters rows("rows", 2, 2, mismatchedAssignPowLimit, 4); //2^2 to 2^26 in steps of 2 - 2^1=2, ..., 2^26=67108864
|
|
BoolParameters cf("cf");
|
|
|
|
ParametersBatch batch({&rows, &cf});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
int numElements = 67108864; //2^26
|
|
int rows = p.getIntParam("rows");
|
|
int cols = numElements / rows;
|
|
bool c = p.getIntParam("cf");
|
|
|
|
auto arr = NDArrayFactory::create_<float>(c ? 'c' : 'f', {rows, cols});
|
|
auto arr2 = NDArrayFactory::create_<float>(c ? 'f' : 'c', {rows, cols});
|
|
x.push_back(arr);
|
|
z.push_back(arr2);
|
|
};
|
|
|
|
TransformBenchmark tb(transform::AnyOps::Assign, "assign");
|
|
output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign");
|
|
|
|
//Also test: NCHW to NHWC and back
|
|
BoolParameters nchw("nchw");
|
|
ParametersBatch batch2({&nchw});
|
|
auto generator2 = PARAMETRIC_XZ() {
|
|
bool nchw = p.getIntParam("nchw");
|
|
|
|
if(nchw) {
|
|
auto orig = NDArrayFactory::create_<float>('c', {16, 32, 64, 64});
|
|
orig->permutei({0,2,3,1});
|
|
x.push_back(orig);
|
|
z.push_back(NDArrayFactory::create_<float>('c', {16, 64, 64, 32}));
|
|
} else {
|
|
auto orig = NDArrayFactory::create_<float>('c', {16, 64, 64, 32});
|
|
orig->permutei({0,3,1,2});
|
|
x.push_back(orig);
|
|
z.push_back(NDArrayFactory::create_<float>('c', {16, 32, 64, 64}));
|
|
}
|
|
};
|
|
|
|
TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw");
|
|
output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign");
|
|
return output;
|
|
}
|
|
|
|
static std::string broadcastOpsMatrixBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Broadcast ops: matrices for rank 3, 4, 5
|
|
for( int rank=3; rank <= broadcastMatrixRankLimit; rank++ ){
|
|
int numAxisTests = -1;
|
|
if(rank == 3){
|
|
numAxisTests = 3;
|
|
} else if(rank == 4){
|
|
numAxisTests = 6;
|
|
} else if(rank == 5){
|
|
numAxisTests = 10;
|
|
}
|
|
|
|
IntParameters testNum("testNum", 0,numAxisTests-1,1);
|
|
ParametersBatch b({&testNum});
|
|
|
|
auto generator = PARAMETRIC_D(){
|
|
int n = p.getIntParam("testNum");
|
|
std::vector<int> axis({});
|
|
switch(n){
|
|
//rank 3+
|
|
case 0:
|
|
axis = std::vector<int>({0,1});
|
|
break;
|
|
case 1:
|
|
axis = std::vector<int>({0,2});
|
|
break;
|
|
case 2:
|
|
axis = std::vector<int>({1,2});
|
|
break;
|
|
//rank 4+
|
|
case 3:
|
|
axis = std::vector<int>({0,3});
|
|
break;
|
|
case 4:
|
|
axis = std::vector<int>({1,3});
|
|
break;
|
|
case 5:
|
|
axis = std::vector<int>({2,3});
|
|
break;
|
|
//Rank 5
|
|
case 6:
|
|
axis = std::vector<int>({0,4});
|
|
break;
|
|
case 7:
|
|
axis = std::vector<int>({1,4});
|
|
break;
|
|
case 8:
|
|
axis = std::vector<int>({2,4});
|
|
break;
|
|
case 9:
|
|
axis = std::vector<int>({3,4});
|
|
break;
|
|
}
|
|
|
|
|
|
std::vector<Nd4jLong> shape({});
|
|
std::vector<Nd4jLong> toBcShape({});
|
|
int vectorLength;
|
|
if(rank == 3){
|
|
shape = std::vector<Nd4jLong>({64,64,64});
|
|
toBcShape = std::vector<Nd4jLong>({64,64,64});
|
|
vectorLength = 64;
|
|
} else if(rank == 4){
|
|
shape = std::vector<Nd4jLong>({32,32,32,32});
|
|
toBcShape = std::vector<Nd4jLong>({32,32,32,32});
|
|
vectorLength = 32;
|
|
} else if(rank == 5){
|
|
shape = std::vector<Nd4jLong>({16,16,16,16,16});
|
|
toBcShape = std::vector<Nd4jLong>({16,16,16,16,16});
|
|
vectorLength = 16;
|
|
}
|
|
|
|
for( int i=0; i<rank; i++ ){
|
|
if(axis[0] == i || axis[1] == i){
|
|
continue;
|
|
}
|
|
toBcShape[i] = 1;
|
|
}
|
|
|
|
auto ctx = new Context(1);
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', shape), true);
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', toBcShape), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', shape), true);
|
|
return ctx;
|
|
};
|
|
|
|
std::string name;
|
|
name += "Broadcast Matrix Add (Custom) - Rank";
|
|
name += std::to_string(rank);
|
|
|
|
nd4j::ops::add op;
|
|
DeclarableBenchmark benchmark(op, "add");
|
|
output += helper.runOperationSuit(&benchmark, generator, b, name.c_str());
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
|
|
static std::string broadcast2dBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
PredefinedParameters rows("rows", {65536});
|
|
IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2, 2^6, 2^10
|
|
BoolParameters axis("axis");
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&rows, &cols, &axis, &inplace});
|
|
|
|
auto generator = PARAMETRIC_D() {
|
|
auto a = p.getIntParam("axis");
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
|
|
|
|
auto ctx = new Context(1);
|
|
ctx->setInputArray(0, arr, true);
|
|
if(a == 0){
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), 1}), true);
|
|
} else {
|
|
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
|
|
}
|
|
if (p.getIntParam("inplace") == 1) {
|
|
ctx->setOutputArray(0, arr);
|
|
ctx->markInplace(true);
|
|
} else {
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}), true);
|
|
}
|
|
return ctx;
|
|
};
|
|
|
|
std::string s("add");
|
|
nd4j::ops::add op;
|
|
DeclarableBenchmark benchmark(op, "add");
|
|
output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
|
|
return output;
|
|
}
|
|
|
|
static std::string broadcastBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
//Broadcast ops: vectors for rank 2, 3, 4, 5
|
|
for( int axis=0; axis<=1; axis++ ){
|
|
PredefinedParameters rows("rows", {65536});
|
|
IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^1 to 2^10 in steps of 2 - 2^1=2, ..., 2^10=1024
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&rows, &cols, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
|
|
x.push_back(arr);
|
|
if(axis == 0){
|
|
y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows")}));
|
|
} else {
|
|
y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("cols")}));
|
|
}
|
|
if (p.getIntParam("inplace") == 1) {
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
|
|
}
|
|
};
|
|
|
|
std::string s("bAdd"); s += std::to_string(axis); s += "r2";
|
|
BroadcastBenchmark bAdd(broadcast::Add, s, {axis});
|
|
output += helper.runOperationSuit(&bAdd, generator, batch, "Broadcast Add - Rank 2");
|
|
}
|
|
|
|
for( int rank=3; rank<=5; rank++ ){
|
|
for( int axis=1; axis<rank; axis++ ){
|
|
std::vector<Nd4jLong> shape({});
|
|
int vectorLength;
|
|
if(rank == 3){
|
|
shape = std::vector<Nd4jLong>({32,128,128});
|
|
vectorLength = 128;
|
|
} else if(rank == 4){
|
|
shape = std::vector<Nd4jLong>({16,64,64,64});
|
|
vectorLength = 64;
|
|
} else if(rank == 5){
|
|
shape = std::vector<Nd4jLong>({16,48,48,48,48});
|
|
vectorLength = 48;
|
|
}
|
|
|
|
ParametersBatch batch({});
|
|
|
|
//Note: always inplace here
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', shape);
|
|
x.push_back(arr);
|
|
y.push_back(NDArrayFactory::create_<float>('c', {vectorLength}));
|
|
z.push_back(arr);
|
|
};
|
|
|
|
std::string name("bArr-r"); name += std::to_string(rank); name += "a"; name += std::to_string(axis);
|
|
BroadcastBenchmark bAdd(broadcast::Add, name, {axis});
|
|
std::string n2("Broadcast Add - Rank"); n2 += std::to_string(rank); n2 += " - axis="; n2 += std::to_string(axis);
|
|
output += helper.runOperationSuit(&bAdd, generator, batch, n2.c_str());
|
|
}
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastStridedReductionNonEws() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters stride("stride", 2, 0, 10, 2); //2^0=1, ..., 2^10=1024
|
|
|
|
ParametersBatch batch({&stride});
|
|
|
|
//This is an edge case: technically an EWS *should* be available here
|
|
auto generator1 = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {131072 + (stride == 1 ? 0 : 1), stride});
|
|
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
IndicesList indices({NDIndex::interval(0,131072), NDIndex::interval(0,1)});
|
|
strided = arr->subarray(indices); //All rows, first column
|
|
delete arr;
|
|
}
|
|
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
|
|
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator1), batch, "Strided Sum - No EWS Test 1");
|
|
|
|
|
|
//No EWS defined for this case
|
|
auto generator2 = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride});
|
|
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
IndicesList indices({NDIndex::interval(0,2*1024,2), NDIndex::all(), NDIndex::interval(0,1)});
|
|
strided = arr->subarray(indices);
|
|
delete arr;
|
|
}
|
|
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
|
|
ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS");
|
|
output += helper.runOperationSuit(&rbSum2, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator2), batch, "Strided Sum - No EWS Test 2");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastStridedReductionIrregular() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4
|
|
PredefinedParameters stride("stride", {26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
|
|
122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
|
|
1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028});
|
|
|
|
ParametersBatch batch({&length, &stride});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
|
|
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
IndicesList indices({NDIndex::all(), NDIndex::interval(0,1)});
|
|
strided = arr->subarray(indices); //All rows, first column
|
|
delete arr;
|
|
}
|
|
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
|
|
|
|
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Strided Sum - Irregular Strides");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastStridedReductionsRegular() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4
|
|
IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024
|
|
|
|
ParametersBatch batch({&length, &stride});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
|
|
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
IndicesList indices({NDIndex::all(), NDIndex::point(0)});
|
|
strided = arr->subarray(indices); //All rows, first column
|
|
delete arr;
|
|
}
|
|
|
|
strided->assign(1.0);
|
|
x.push_back(strided);
|
|
y.push_back(nullptr);
|
|
// z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
z.push_back(NDArrayFactory::create_<float>('c', {1}));
|
|
};
|
|
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "Strided Sum");
|
|
|
|
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Strided Sum - Regular Strides (powers of 2)");
|
|
|
|
auto generator3 = PARAMETRIC_D(){
|
|
auto ctx = new Context(1);
|
|
auto stride = p.getIntParam("stride");
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
|
|
|
|
NDArray* strided;
|
|
if(stride == 1){
|
|
strided = arr;
|
|
} else {
|
|
IndicesList indices({NDIndex::all(), NDIndex::point(0)});
|
|
strided = arr->subarray(indices); //All rows, first column
|
|
delete arr;
|
|
}
|
|
|
|
strided->assign(1.0);
|
|
ctx->setInputArray(0, strided, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {1}), true);
|
|
auto iargs = new Nd4jLong[1];
|
|
iargs[0] = 0;
|
|
ctx->setIArguments(iargs, 1);
|
|
delete[] iargs;
|
|
return ctx;
|
|
};
|
|
|
|
nd4j::ops::argmax opArgmax;
|
|
DeclarableBenchmark dbArgmax(opArgmax, "stridedArgmax");
|
|
output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Strided Argmax");
|
|
return output;
|
|
}
|
|
|
|
static std::string fastReduceAlongDimBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
int length[] = {1024*1024, 64*1024*1024};
|
|
int powLimit[] = {10, 20, 26};
|
|
int powStep[] = {2, 2, 4};
|
|
|
|
for( int i=0; i < limit3; i++ ){
|
|
IntPowerParameters rows("rows", 2, 0, powLimit[i], powStep[i]);
|
|
BoolParameters dim("dim");
|
|
|
|
|
|
ParametersBatch batch({&rows, &dim});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
int rows = p.getIntParam("rows");
|
|
int cols = length[i] / rows;
|
|
int dim = p.getIntParam("dim");
|
|
auto arr = NDArrayFactory::create_<float>('c', {rows, cols});
|
|
|
|
|
|
x.push_back(arr);
|
|
y.push_back(NDArrayFactory::create_<Nd4jLong>(dim));
|
|
|
|
NDArray* result;
|
|
if(dim == 0){
|
|
result = NDArrayFactory::create_<float>('c', {cols});
|
|
} else {
|
|
result = NDArrayFactory::create_<float>('c', {rows});
|
|
}
|
|
z.push_back(result);
|
|
};
|
|
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
|
|
ReductionBenchmark rbMax(reduce::SameOps::Max, "max");
|
|
|
|
std::string s1("Sum Along Dimension - ");
|
|
s1 += std::to_string(length[i]);
|
|
|
|
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s1.c_str());
|
|
|
|
|
|
auto generator3 = PARAMETRIC_D(){
|
|
auto ctx = new Context(1);
|
|
int rows = p.getIntParam("rows");
|
|
int cols = length[i] / rows;
|
|
int dim = p.getIntParam("dim");
|
|
auto arr = NDArrayFactory::create_<float>('c', {rows, cols});
|
|
|
|
Nd4jLong* dimArg = new Nd4jLong[1];
|
|
dimArg[0] = dim;
|
|
ctx->setIArguments(dimArg, 1);
|
|
delete[] dimArg;
|
|
|
|
ctx->setInputArray(0, arr, true);
|
|
|
|
NDArray* result;
|
|
if(dim == 0){
|
|
result = NDArrayFactory::create_<Nd4jLong>('c', {cols});
|
|
} else {
|
|
result = NDArrayFactory::create_<Nd4jLong>('c', {rows});
|
|
}
|
|
ctx->setOutputArray(0, result, true);
|
|
return ctx;
|
|
};
|
|
|
|
std::string s5("Argmax Along Dimension - ");
|
|
s5 += std::to_string(length[i]);
|
|
|
|
nd4j::ops::argmax opArgmax;
|
|
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
|
|
output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str());
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastReduceToScalarBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 10, reduceScalarPowLimit, 4); //2^10 to 2^26 in steps of 4
|
|
|
|
ParametersBatch batch({&length});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
|
|
x.push_back(arr);
|
|
y.push_back(nullptr);
|
|
z.push_back(NDArrayFactory::create_<float>(0.0f));
|
|
};
|
|
|
|
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
|
|
|
|
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Sum - Full Array Reduction");
|
|
|
|
//Index reduction
|
|
nd4j::ops::argmax opArgmax;
|
|
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
|
|
auto generator3 = PARAMETRIC_D(){
|
|
auto ctx = new Context(1);
|
|
|
|
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
|
|
ctx->setInputArray(1, NDArrayFactory::create_<Nd4jLong>((Nd4jLong)0), true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<Nd4jLong>(0), true);
|
|
|
|
return ctx;
|
|
};
|
|
output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastNonEwsTransformBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
IntPowerParameters rowcol("rowcol", 2, 2, nonEwsPowLimit, 4); //2^2 to 2^14 in steps of 4 -> non-inplace case: 2x 2^10 x 2^10 = 128mb
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&rowcol, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
int r = p.getIntParam("rowcol");
|
|
auto arr = NDArrayFactory::create_<float>('c', {r, r+1});
|
|
IndicesList indices({NDIndex::all(), NDIndex::interval(0,r-1)});
|
|
auto view = arr->subarray(indices);
|
|
//nd4j_printf("VIEW ARRAY: rows=%lld, columns=%lld", view->sizeAt(0), view->sizeAt(1));
|
|
x.push_back(view);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(view);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {r,r}));
|
|
}
|
|
delete arr;
|
|
};
|
|
|
|
ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU_View");
|
|
sbLRelu.setY(NDArrayFactory::create_<float>(0.0));
|
|
|
|
TransformBenchmark tbExp(transform::StrictOps::Exp, "exp view");
|
|
|
|
output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU View");
|
|
output += helper.runOperationSuit(&tbExp, generator, batch, "Exp View");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastPairwiseBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
IntPowerParameters length("length", 2, 10, pairwisePowLimit, 4); //2^10 to 2^26 in steps of 4 -> max is 512mb
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XYZ() {
|
|
auto arr1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
auto arr2 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
x.push_back(arr1);
|
|
y.push_back(arr2);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr1);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
PairwiseBenchmark pb1(pairwise::Ops::Add, "Add");
|
|
output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add");
|
|
|
|
PairwiseBenchmark pb2(pairwise::Ops::Add, "Multiply");
|
|
output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Multiply");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string heavyTransformsBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
IntPowerParameters length("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22, steps of 4
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
arr->assign(1.0);
|
|
x.push_back(arr);
|
|
if (p.getIntParam("inplace") == 1) {
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
//Ops to test: erf (transform), betainc (custom), polygamma, synthetic ops?
|
|
TransformBenchmark erf(transform::StrictOps::Erf, "Erf");
|
|
output += helper.runOperationSuit(&erf, generator, batch, "Error Function (Erf)");
|
|
|
|
ParametersBatch batch2({&length});
|
|
nd4j::ops::polygamma op1;
|
|
DeclarableBenchmark pg(op1, "polygamma");
|
|
auto generator2 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
auto in0 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
in0->assign(0.25);
|
|
auto in1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
in1->assign(0.5);
|
|
ctx->setInputArray(0, in0, true);
|
|
ctx->setInputArray(1, in1, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
|
|
return ctx;
|
|
};
|
|
|
|
|
|
IntPowerParameters lengthBetaInc("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22 in steps of 4
|
|
ParametersBatch batch3({&lengthBetaInc});
|
|
nd4j::ops::betainc op2;
|
|
DeclarableBenchmark binc(op2, "betainc");
|
|
auto generator3 = PARAMETRIC_D() {
|
|
auto ctx = new Context(1);
|
|
auto in0 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
in0->assign(0.25);
|
|
auto in1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
in1->assign(0.5);
|
|
auto in2 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
in2->assign(0.75);
|
|
ctx->setInputArray(0, in0, true);
|
|
ctx->setInputArray(1, in1, true);
|
|
ctx->setInputArray(2, in2, true);
|
|
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
|
|
return ctx;
|
|
};
|
|
|
|
output += helper.runOperationSuit(&pg, generator2, batch2, "PolyGamma Function");
|
|
output += helper.runOperationSuit(&binc, generator3, batch3, "Incomplete Beta Function (BetaInc)");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string intermediateTransformsBenchmark() {
|
|
std::string output;
|
|
|
|
//Non-inplace: 2x 2^26 elements FP32 -> 512MB
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
IntPowerParameters length("length", 2, 10, intermediateTransformPowLimit, 4); //2^20 to 2^22 in steps of 4
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
arr->assign(1.0);
|
|
x.push_back(arr);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
TransformBenchmark tbTanh(transform::StrictOps::Tanh, "tanh");
|
|
TransformBenchmark tbGelu(transform::StrictOps::GELU, "gelu");
|
|
|
|
output += helper.runOperationSuit(&tbTanh, generator, batch, "Tanh");
|
|
output += helper.runOperationSuit(&tbGelu, generator, batch, "gelu");
|
|
|
|
|
|
//2x 1024 cols x 2^18 = 2GB
|
|
IntPowerParameters rows("rows", 2, 10, intermediateTransformPowLimit2, 4);
|
|
PredefinedParameters cols("cols", {4, 128, 1024});
|
|
|
|
ParametersBatch batch2({&rows, &cols, &inplace});
|
|
|
|
auto generator2 = PARAMETRIC_XZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
|
|
arr->assign(1.0);
|
|
x.push_back(arr);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
|
|
}
|
|
};
|
|
|
|
//TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");
|
|
|
|
//output += helper.runOperationSuit(&tbSoftmax, generator2, batch2, "Softmax");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastTransformsBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
IntPowerParameters length("length", 2, 10, transformBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
arr->assign(1.0);
|
|
x.push_back(arr);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU");
|
|
sbLRelu.setY(NDArrayFactory::create_<float>(0.0));
|
|
|
|
TransformBenchmark tbAbs(transform::SameOps::Abs, "abs");
|
|
TransformBenchmark tbExp(transform::StrictOps::Exp, "exp");
|
|
|
|
output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU");
|
|
output += helper.runOperationSuit(&tbAbs, generator, batch, "Abs");
|
|
output += helper.runOperationSuit(&tbExp, generator, batch, "Exp");
|
|
|
|
return output;
|
|
}
|
|
|
|
static std::string fastScalarBenchmark() {
|
|
std::string output;
|
|
BenchmarkHelper helper(wIterations, rIterations);
|
|
|
|
IntPowerParameters length("length", 2, 10, scalarBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26
|
|
BoolParameters inplace("inplace");
|
|
|
|
ParametersBatch batch({&length, &inplace});
|
|
|
|
auto generator = PARAMETRIC_XZ() {
|
|
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
|
|
arr->assign(1.0);
|
|
x.push_back(arr);
|
|
if(p.getIntParam("inplace") == 1){
|
|
z.push_back(arr);
|
|
} else {
|
|
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
|
|
}
|
|
};
|
|
|
|
ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
|
|
ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
|
|
ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");
|
|
|
|
|
|
sbAdd.setY(NDArrayFactory::create_<float>(3.14159265359));
|
|
sbDiv.setY(NDArrayFactory::create_<float>(3.14159265359));
|
|
sbPow.setY(NDArrayFactory::create_<float>(3.14159265359));
|
|
|
|
|
|
output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359) - F32");
|
|
output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359) - F32");
|
|
output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359) - F32");
|
|
|
|
return output;
|
|
}
|
|
|
|
|
|
static long nowMs(){
|
|
auto s = std::chrono::system_clock::now().time_since_epoch();
|
|
auto v = std::chrono::duration_cast<std::chrono::milliseconds>(s).count();
|
|
return v;
|
|
}
|
|
|
|
static long duration(long start){
|
|
return nowMs() - start;
|
|
}
|
|
|
|
static long done(long start){
|
|
long dur = duration(start);
|
|
nd4j_printf("Done: %i ms\n", dur);
|
|
return nowMs();
|
|
}
|
|
|
|
|
|
std::string FullBenchmarkSuit::runSuit() {
|
|
std::string result;
|
|
|
|
long start = nowMs();
|
|
|
|
// set 1
|
|
nd4j_printf("Running FullBenchmarkSuite.fastScalarBenchmark\n", "");
|
|
result += fastScalarBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastTransformsBenchmark\n", "");
|
|
result += fastTransformsBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.intermediateTransformsBenchmark\n", "");
|
|
result += intermediateTransformsBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastPairwiseBenchmark\n", "");
|
|
result += fastPairwiseBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.heavyTransformsBenchmark\n", "");
|
|
result += heavyTransformsBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastNonEwsTransformBenchmark\n", "");
|
|
result += fastNonEwsTransformBenchmark();
|
|
start = done(start);
|
|
|
|
// set 2
|
|
nd4j_printf("Running FullBenchmarkSuite.fastReduceToScalarBenchmark\n", "");
|
|
result += fastReduceToScalarBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastReduceAlongDimBenchmark\n", "");
|
|
result += fastReduceAlongDimBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionsRegular\n", "");
|
|
result += fastStridedReductionsRegular();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionIrregular\n", "");
|
|
result += fastStridedReductionIrregular();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionNonEws\n", "");
|
|
result += fastStridedReductionNonEws();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.broadcastBenchmark\n", "");
|
|
result += broadcastBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.broadcast2dBenchmark\n", "");
|
|
result += broadcast2dBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.broadcastOpsMatrixBenchmark\n", "");
|
|
result += broadcastOpsMatrixBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.mismatchedOrdersAssignBenchmark\n", "");
|
|
result += mismatchedOrdersAssignBenchmark();
|
|
start = done(start);
|
|
|
|
|
|
// set 3
|
|
nd4j_printf("Running FullBenchmarkSuite.gatherOpBenchmark\n", "");
|
|
result += gatherOpBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.scatterOpBenchmark\n", "");
|
|
result += scatterOpBenchmark();
|
|
start = done(start);
|
|
|
|
// set 4
|
|
nd4j_printf("Running FullBenchmarkSuite.gemmRegularBenchmark\n", "");
|
|
result += gemmRegularBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.gemmIrregularBenchmark\n", "");
|
|
result += gemmIrregularBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.rngBenchmark\n", "");
|
|
result += rngBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.conv2dBenchmark\n", "");
|
|
result += conv2dBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.pool2dBenchmark\n", "");
|
|
result += pool2dBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.batchnormBenchmark\n", "");
|
|
result += batchnormBenchmark();
|
|
start = done(start);
|
|
|
|
nd4j_printf("Running FullBenchmarkSuite.lstmBenchmark\n", "");
|
|
result += lstmBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.conv3dBenchmark\n", "");
|
|
result += conv3dBenchmark();
|
|
start = done(start);
|
|
nd4j_printf("Running FullBenchmarkSuite.maxPool3DBenchmark\n", "");
|
|
result += maxPool3DBenchmark();
|
|
start = done(start);
|
|
// nd4j_printf("Running FullBenchmarkSuite.layerNormBenchmark\n", "");
|
|
// result += layerNormBenchmark();
|
|
// start = done(start);
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
} |