cavis/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp

1924 lines
73 KiB
C++

/* ******************************************************************************
*
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <ops/declarable/CustomOperations.h>
#include <performance/benchmarking/FullBenchmarkSuit.h>
#include <ops/declarable/LegacyRandomOp.h>
#include <algorithm>
#ifdef RELEASE_BUILD
int wIterations = 4;
int rIterations = 20;
int gemmRegularUpperPow = 11;
int scalarBenchmarkPowLimit = 26;
int transformBenchmarkPowLimit = 26;
int intermediateTransformPowLimit = 22;
int intermediateTransformPowLimit2 = 18;
int pairwisePowLimit = 26;
int heavyPowLimit = 22;
int nonEwsPowLimit = 10;
int reduceScalarPowLimit = 26;
int stridedReductionPowLimit = 20;
int mismatchedAssignPowLimit = 26;
int gatherOpPowLimit = 18;
int gatherOpPowLimit2 = 16;
int gatherOpPowLimit3 = 12;
int broadcastMatrixRankLimit = 5;
int limit30 = 30;
int limit26 = 26;
int limit24 = 24;
int limit22 = 22;
int limit20 = 20;
int limit18 = 18;
int limit10 = 10;
int limit5 = 5;
int limit3 = 3;
#else
int wIterations = 0;
int rIterations = 1;
int gemmRegularUpperPow = 7;
int scalarBenchmarkPowLimit = 10;
int transformBenchmarkPowLimit = 10;
int intermediateTransformPowLimit = 10;
int intermediateTransformPowLimit2 = 10;
int pairwisePowLimit = 10;
int heavyPowLimit = 10;
int nonEwsPowLimit = 6;
int reduceScalarPowLimit = 10;
int stridedReductionPowLimit = 12;
int mismatchedAssignPowLimit = 2;
int gatherOpPowLimit = 10;
int gatherOpPowLimit2 = 8;
int gatherOpPowLimit3 = 8;
int broadcastMatrixRankLimit = 3;
int limit26 = 8;
int limit24 = 8;
int limit22 = 8;
int limit20 = 8;
int limit18 = 8;
int limit10 = 4;
int limit5 = 3;
int limit3 = 1;
#endif
namespace sd {
static std::string layerNormBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
BoolParameters nhwc("nhwc"); //0 = nchw
#ifdef _RELEASE
int c = 32;
int hw = 64;
#else
int c = 3;
int hw = 8;
#endif
ParametersBatch batch({&nhwc});
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int axis;
if (n == 0) {
//nchw
auto input = NDArrayFactory::create_<float>('c', {16, c, hw, hw});
auto output = NDArrayFactory::create_<float>('c', {16, c, hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
axis = 1;
} else {
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, c});
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, c});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
axis = 3;
}
auto bias = NDArrayFactory::create_<float>('c', {c});
ctx->setInputArray(1, bias, true);
auto iargs = new Nd4jLong[1];
iargs[0] = axis;
ctx->setIArguments(iargs, 1);
delete[] iargs;
return ctx;
};
sd::ops::layer_norm layerNorm;
DeclarableBenchmark benchmark(layerNorm, "layer norm");
output += helper.runOperationSuit(&benchmark, generator, batch, "Layer Norm");
return output;
}
static std::string maxPool3DBenchmark(){
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
BoolParameters ncdhw("ncdhw"); //1 = ndhwc
ParametersBatch batch({&ncdhw});
sd::ops::maxpool3dnew maxpool3Dnew;
DeclarableBenchmark benchmark(maxpool3Dnew, "maxPool3d");
#ifdef _RELEASE
int mb = 16;
int chIn = 16;
int chOut = 16;
int dhw = 64;
#else
int mb = 1;
int chIn = 3;
int chOut = 3;
int dhw = 16;
#endif
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int format = p.getIntParam("ncdhw");
//Set inputs and outputs
//Same mode + stride 1: output is same shape as input
if(format == 1) {
//NDHWC
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
} else {
//NCDHW
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
}
auto iargs = new Nd4jLong[15];
//Kernel, strides, padding, dilation - x3 each
iargs[0] = 3; //Kernel
iargs[1] = 3;
iargs[2] = 3;
iargs[3] = 1; //Stride
iargs[4] = 1;
iargs[5] = 1;
iargs[6] = 0; //Padding
iargs[7] = 0;
iargs[8] = 0;
iargs[9] = 1; //Dilation
iargs[10] = 1;
iargs[11] = 1;
iargs[12] = 1; //Same mode
iargs[13] = 0; //Unused for max
iargs[14] = format; //0 = ncdhw
ctx->setIArguments(iargs, 14);
delete[] iargs;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "maxPool3d");
return output;
}
static std::string conv3dBenchmark(){
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
BoolParameters ncdhw("ncdhw"); //1 = ndhwc
ParametersBatch batch({&ncdhw});
sd::ops::conv3dnew conv3Dnew;
DeclarableBenchmark benchmark(conv3Dnew, "conv3d");
#ifdef _RELEASE
int mb = 16;
int chIn = 16;
int chOut = 16;
int dhw = 64;
#else
int mb = 1;
int chIn = 3;
int chOut = 3;
int dhw = 16;
#endif
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int format = p.getIntParam("ncdhw");
//Set inputs and outputs
//Same mode + stride 1: output is same shape as input
if(format == 1) {
//NDHWC
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, dhw, dhw, dhw, chIn}), true);
} else {
//NCDHW
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {mb, chIn, dhw, dhw, dhw}), true);
}
//Weights and bias:
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {3, 3, 3, chIn, chOut}), true);
ctx->setInputArray(2, NDArrayFactory::create_<float>('c', {chOut}), true);
auto iargs = new Nd4jLong[14];
//Kernel, strides, padding, dilation - x3 each
iargs[0] = 3; //Kernel
iargs[1] = 3;
iargs[2] = 3;
iargs[3] = 1; //Stride
iargs[4] = 1;
iargs[5] = 1;
iargs[6] = 0; //Padding
iargs[7] = 0;
iargs[8] = 0;
iargs[9] = 1; //Dilation
iargs[10] = 1;
iargs[11] = 1;
iargs[12] = 1; //Same mode
iargs[13] = format; //0 = ncdhw
ctx->setIArguments(iargs, 14);
delete[] iargs;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "CNN3D");
return output;
}
static std::string lstmBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]
#ifdef _RELEASE
PredefinedParameters mb("mb", {1, 8, 64});
PredefinedParameters nInOut("nInOut", {32, 256, 1024});
#else
PredefinedParameters mb("mb", {1});
PredefinedParameters nInOut("nInOut", {32});
#endif
ParametersBatch batch({&format, &mb, &nInOut});
sd::ops::lstmBlock lstmBlock;
DeclarableBenchmark benchmark(lstmBlock, "lstm");
int seqLength = 32;
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int f = p.getIntParam("format");
int m = p.getIntParam("mb");
int n = p.getIntParam("nInOut");
Nd4jLong l = 0;
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>(l), true); //Max TS length (unused)
if (f == 0) {
//TNS format
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //x
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //i
ctx->setOutputArray(1, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //c
ctx->setOutputArray(2, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //f
ctx->setOutputArray(3, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //o
ctx->setOutputArray(4, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //z
ctx->setOutputArray(5, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //h
ctx->setOutputArray(6, NDArrayFactory::create_<float>('c', {seqLength, m, n}), true); //y
} else {
//NST format
ctx->setInputArray(1, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //x
ctx->setOutputArray(0, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //i
ctx->setOutputArray(1, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //c
ctx->setOutputArray(2, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //f
ctx->setOutputArray(3, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //o
ctx->setOutputArray(4, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //z
ctx->setOutputArray(5, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //h
ctx->setOutputArray(6, NDArrayFactory::create_<float>('f', {m, n, seqLength}), true); //y
}
auto cLast = NDArrayFactory::create_<float>('c', {m, n});
auto yLast = NDArrayFactory::create_<float>('c', {m, n});
auto W = NDArrayFactory::create_<float>('c', {2 * n, 4 * n});
auto Wci = NDArrayFactory::create_<float>('c', {n});
auto Wcf = NDArrayFactory::create_<float>('c', {n});
auto Wco = NDArrayFactory::create_<float>('c', {n});
auto b = NDArrayFactory::create_<float>('c', {4 * n});
ctx->setInputArray(2, cLast, true);
ctx->setInputArray(3, yLast, true);
ctx->setInputArray(4, W, true);
ctx->setInputArray(5, Wci, true);
ctx->setInputArray(6, Wcf, true);
ctx->setInputArray(7, Wco, true);
ctx->setInputArray(8, b, true);
auto iargs = new Nd4jLong[2];
iargs[0] = 0; //No peephole
iargs[1] = f;
ctx->setIArguments(iargs, 2);
delete[] iargs;
auto targs = new double[2];
targs[0] = 1.0; //forget bias
targs[1] = 0.0; //cell clipping value
ctx->setTArguments(targs, 2);
delete[] targs;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
return output;
}
static std::string batchnormBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Convolution2D op
BoolParameters nhwc("nhwc");
#ifdef _RELEASE
PredefinedParameters c("c", {3, 32, 128});
PredefinedParameters hw("hw", {32, 128});
#else
PredefinedParameters c("c", {3});
PredefinedParameters hw("hw", {16});
#endif
ParametersBatch batch({&nhwc, &c, &hw});
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int hw = p.getIntParam("hw");
int ch = p.getIntParam("c");
auto args = new Nd4jLong[3];
args[0] = args[1] = 1; //apply scale and offset
if (n == 0) {
auto input = NDArrayFactory::create_<float>('c', {32, ch, hw, hw});
auto output = NDArrayFactory::create_<float>('c', {32, ch, hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
args[2] = 1; //axis
} else {
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, ch});
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, ch});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
args[2] = 3; //axis
}
ctx->setIArguments(args, 3);
delete[] args;
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {ch}), true); //mean
auto v = NDArrayFactory::create_<float>('c', {ch});
v->assign(1.0f);
ctx->setInputArray(2, v, true); //variance
auto g = NDArrayFactory::create_<float>('c', {ch});
g->assign(1.0);
ctx->setInputArray(3, g, true); //gamma
auto b = NDArrayFactory::create_<float>('c', {ch});
b->assign(1.0);
ctx->setInputArray(4, b, true); //beta
auto targs = new double[1];
targs[0] = 1e-5;
ctx->setTArguments(targs, 1);
delete[] targs;
return ctx;
};
sd::ops::batchnorm batchnorm;
DeclarableBenchmark benchmark(batchnorm, "batchnorm");
output += helper.runOperationSuit(&benchmark, generator, batch, "Batch Normalization");
return output;
}
static std::string pool2dBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Convolution2D op
BoolParameters nhwc("nhwc");
#ifdef _RELEASE
PredefinedParameters k("k", {2, 3, 5});
PredefinedParameters c("c", {3, 32, 128});
PredefinedParameters hw("hw", {32, 128});
#else
PredefinedParameters k("k", {2});
PredefinedParameters c("c", {3});
PredefinedParameters hw("hw", {8});
#endif
ParametersBatch batch({&nhwc, &k, &c, &hw});
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int hw = p.getIntParam("hw");
int khw = p.getIntParam("k");
if (n == 0) {
auto input = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
auto output = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
} else {
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
}
auto args = new Nd4jLong[11];
args[0] = args[1] = khw; //Kernel
args[2] = args[3] = 1;//Stride
args[4] = args[5] = 0; //Pad
args[6] = args[7] = 1; //Dilation
args[8] = 1; //SAME
args[9] = 0; //Divisor mode - 0 = exclude padding in divisor
args[10] = n;//0-nchw, 1=nhwc
ctx->setIArguments(args, 11);
delete[] args;
return ctx;
};
sd::ops::avgpool2d avgpool2d;
DeclarableBenchmark benchmark1(avgpool2d, "avgpool");
output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pooling 2d Operation");
sd::ops::maxpool2d maxpool2d;
DeclarableBenchmark benchmark2(maxpool2d, "maxpool");
output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pooling 2d Operation");
return output;
}
static std::string conv2dBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Convolution2D op
BoolParameters nhwc("nhwc");
#ifdef _RELEASE
PredefinedParameters k("k", {2, 3, 5});
PredefinedParameters c("c", {3, 32, 128});
PredefinedParameters hw("hw", {32, 128});
#else
PredefinedParameters k("k", {2});
PredefinedParameters c("c", {3});
PredefinedParameters hw("hw", {8});
#endif
ParametersBatch batch({&nhwc, &k, &c, &hw});
sd::ops::conv2d conv2d;
DeclarableBenchmark benchmark(conv2d, "conv2d");
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int hw = p.getIntParam("hw");
int khw = p.getIntParam("k");
if (n == 0) {
auto input = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
auto output = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
} else {
auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
}
auto b = NDArrayFactory::create_<float>('c', {p.getIntParam("c")});
auto w = NDArrayFactory::create_<float>('c', {khw, khw, p.getIntParam("c"), p.getIntParam("c")}); // [kH, kW, iC, oC] always
ctx->setInputArray(1, w, true);
ctx->setInputArray(2, b, true);
auto args = new Nd4jLong[10];
args[0] = args[1] = khw; //Kernel
args[2] = args[3] = 1;//Stride
args[4] = args[5] = 0; //Pad
args[6] = args[7] = 1; //Dilation
args[8] = 1; //SAME
args[9] = n;//0-nchw, 1=nhwc
ctx->setIArguments(args, 10);
delete[] args;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d Operation");
return output;
}
static std::string rngBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Uniform, gaussian and bernoulli RNG generation
IntPowerParameters length("length", 2, 4, scalarBenchmarkPowLimit, 3); //2^8 to 2^30 in steps of 3
ParametersBatch batch({&length});
auto gen01 = PARAMETRIC_D() {
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
auto d = new double[2];
d[0] = 0.0;
d[1] = 1.0;
ctx->setTArguments(d, 2);
delete[] d;
return ctx;
};
auto gen05 = PARAMETRIC_D() {
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
auto d = new double[1];
d[0] = 0.5;
ctx->setTArguments(d, 1);
delete[] d;
return ctx;
};
sd::ops::LegacyRandomOp unif(random::UniformDistribution);
DeclarableBenchmark dbU(unif, "uniform");
output += helper.runOperationSuit(&dbU, gen01, batch, "Uniform Distribution");
sd::ops::LegacyRandomOp gaussian(random::GaussianDistribution);
DeclarableBenchmark dbG(gaussian, "gaussian");
output += helper.runOperationSuit(&dbG, gen01, batch, "Gaussian Distribution");
sd::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution);
DeclarableBenchmark dbTU(unif, "trunc.norm");
output += helper.runOperationSuit(&dbTU, gen01, batch, "Truncated Normal Distribution");
sd::ops::LegacyRandomOp ln(random::LogNormalDistribution);
DeclarableBenchmark dbLN(ln, "uniform");
output += helper.runOperationSuit(&dbLN, gen01, batch, "Log Normal Distribution");
sd::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution);
DeclarableBenchmark dbB(bernoulli, "bernoulli");
output += helper.runOperationSuit(&dbB, gen05, batch, "Bernoulli Distribution");
sd::ops::LegacyRandomOp dropout(random::BernoulliDistribution);
DeclarableBenchmark dbD(dropout, "dropout");
output += helper.runOperationSuit(&dbD, gen05, batch, "Dropout");
return output;
}
static std::string gemmIrregularBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Basically the same as above, but with irregular shapes (not multiples of 8, etc)
#ifdef _RELEASE
int tAMax = 1;
int tBMax = 1;
int b = 1024;
int c = 1024;
#else
int tAMax = 1;
int tBMax = 1;
int b = 32;
int c = 32;
#endif
for (int tA = 0; tA <= tAMax; tA++) {
for (int tB = 0; tB <= tBMax; tB++) {
IntParameters d("d", 1020, 1028, 1); //1020, 1021, ..., 1028
ParametersBatch dim({&d});
//Vary A.rows:
auto generator = PARAMETRIC_XYZ() {
auto a = p.getIntParam("d");
std::vector<Nd4jLong> shapeA;
std::vector<Nd4jLong> shapeB;
if (tA) {
shapeA = {b, a};
} else {
shapeA = {a, b};
}
if (tB) {
shapeB = {c, b};
} else {
shapeB = {b, c};
}
auto A = NDArrayFactory::create_<float>('c', shapeA);
auto B = NDArrayFactory::create_<float>('c', shapeB);
auto C = NDArrayFactory::create_<float>('f', {a, c});
x.push_back(A);
y.push_back(B);
z.push_back(C);
};
std::string n;
n += "Gemm (a.rows) - tA=";
n += std::to_string(tA);
n += ", tB=";
n += std::to_string(tB);
MatrixBenchmark mb(1.0, 0.0, tA, tB, n);
output += helper.runOperationSuit(&mb, generator, dim, n.c_str());
//Vary A.columns / B.rows
auto generator2 = PARAMETRIC_XYZ() {
auto a = 1024;
auto b = p.getIntParam("d");
auto c = 1024;
std::vector<Nd4jLong> shapeA;
std::vector<Nd4jLong> shapeB;
if (tA) {
shapeA = {b, a};
} else {
shapeA = {a, b};
}
if (tB) {
shapeB = {c, b};
} else {
shapeB = {b, c};
}
auto A = NDArrayFactory::create_<float>('c', shapeA);
auto B = NDArrayFactory::create_<float>('c', shapeB);
auto C = NDArrayFactory::create_<float>('f', {a, c});
x.push_back(A);
y.push_back(B);
z.push_back(C);
};
std::string n2;
n2 += "Gemm (a.columns) - tA=";
n2 += std::to_string(tA);
n2 += ", tB=";
n2 += std::to_string(tB);
MatrixBenchmark mb2(1.0, 0.0, tA, tB, n2);
output += helper.runOperationSuit(&mb2, generator2, dim, n2.c_str());
//Vary A.columns / B.rows
auto generator3 = PARAMETRIC_XYZ() {
auto a = 1024;
auto b = 1024;
auto c = p.getIntParam("d");
std::vector<Nd4jLong> shapeA;
std::vector<Nd4jLong> shapeB;
if (tA) {
shapeA = {b, a};
} else {
shapeA = {a, b};
}
if (tB) {
shapeB = {c, b};
} else {
shapeB = {b, c};
}
auto A = NDArrayFactory::create_<float>('c', shapeA);
auto B = NDArrayFactory::create_<float>('c', shapeB);
auto C = NDArrayFactory::create_<float>('f', {a, c});
x.push_back(A);
y.push_back(B);
z.push_back(C);
};
std::string n3;
n3 += "Gemm (b.columns) - tA=";
n3 += std::to_string(tA);
n3 += ", tB=";
n3 += std::to_string(tB);
MatrixBenchmark mb3(1.0, 0.0, tA, tB, n);
output += helper.runOperationSuit(&mb3, generator3, dim, n3.c_str());
}
}
return output;
}
static std::string batchGemmBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Rank 3 - [32,1024,1024]x[32,1024,1024]
//Rank 4 - [4,8,1024,1024]x[4,8,1024,1024]
IntParameters rank("rank", 3, 4, 1);
ParametersBatch b({&rank});
auto generator = PARAMETRIC_D() {
auto rank = p.getIntParam("rank");
std::vector<Nd4jLong> shapeA;
std::vector<Nd4jLong> shapeB;
auto ctx = new Context(1);
if(rank == 3){
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
} else {
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {4, 8, 1024, 1024}), true);
}
return ctx;
};
sd::ops::matmul mmul;
DeclarableBenchmark benchmark(mmul, "mmul (batch)");
output += helper.runOperationSuit(&benchmark, generator, b, "MMul (batch)");
return output;
}
static std::string gemmRegularBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
for (int o = 0; o <= 1; o++) {
char resultOrder = (o == 0 ? 'f' : 'c');
for (int tA = 0; tA <= 1; tA++) {
for (int tB = 0; tB <= 1; tB++) {
IntPowerParameters pa("sz", 2, 7, gemmRegularUpperPow, 2); //2^7=128, 2^9=512, 2^11=2048
ParametersBatch b({&pa});
auto generator = PARAMETRIC_XYZ() {
auto s = p.getIntParam("sz");
auto A = NDArrayFactory::create_<float>('c', {s, s});
auto B = NDArrayFactory::create_<float>('c', {s, s});
auto C = NDArrayFactory::create_<float>(resultOrder, {s, s});
x.push_back(A);
y.push_back(B);
z.push_back(C);
};
std::string n;
n += "Gemm - tA=";
n += std::to_string(tA);
n += ", tB=";
n += std::to_string(tB);
n += ", cOrder=";
n += resultOrder;
MatrixBenchmark mb(1.0, 0.0, tA == 0 ? false : true, tB == 0 ? false : true, n);
output += helper.runOperationSuit(&mb, generator, b, n.c_str());
}
}
}
return output;
}
static std::string scatterOpBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^26 in steps of 4
ParametersBatch batch({&length});
//Gather 1D tests - 1d ref, 1d indices, 1d updates -> 1d output
sd::ops::scatter_upd scatter_update1;
DeclarableBenchmark sa1d(scatter_update1, "scatter_update1d");
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int length = p.getIntParam("length");
auto in = NDArrayFactory::create_<float>('c', {length});
auto indices = NDArrayFactory::create_<int>('c', {length});
auto updates = NDArrayFactory::create_<float>('c', {length});
int* a = new int[length];
for( int i=0; i<length; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + length-1));
for( int i=0; i<length; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setInputArray(2, updates, true);
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
ctx->markInplace(true);
return ctx;
};
output += helper.runOperationSuit(&sa1d, generator, batch, "Scatter Update - 1d");
//Gather 2D tests - 2d input, 1d indices, 2d updates -> 2d output
IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^16 in steps of 2: 2^10, ..., 2^20
PredefinedParameters cols("cols", {32});
ParametersBatch batch2({&rows, &cols});
sd::ops::scatter_upd scatter_update2;
DeclarableBenchmark sa2d(scatter_update2, "scatter_update2d");
auto generator2 = PARAMETRIC_D() {
auto ctx = new Context(1);
int rows = p.getIntParam("rows");
int cols = p.getIntParam("cols");
auto in = NDArrayFactory::create_<float>('c', {rows, cols});
auto indices = NDArrayFactory::create_<int>('c', {rows});
auto updates = NDArrayFactory::create_<float>('c', {rows, cols});
int* a = new int[rows];
for( int i=0; i<rows; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + rows-1));
for( int i=0; i<rows; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setInputArray(2, updates, true);
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
ctx->markInplace(true);
return ctx;
};
output += helper.runOperationSuit(&sa2d, generator2, batch2, "Scatter Update - 2d");
//Gather 3D tests - 3d input, 1d indices -> 3d output
IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4);
PredefinedParameters sz1("sz1", {32});
ParametersBatch batch3({&sz0, &sz1});
sd::ops::scatter_upd scatter_update3;
DeclarableBenchmark sa3d(scatter_update3, "scatter3d");
auto generator3 = PARAMETRIC_D() {
auto ctx = new Context(1);
int sz0 = p.getIntParam("sz0");
int sz1 = p.getIntParam("sz1");
auto in = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
auto indices = NDArrayFactory::create_<int>('c', {sz0});
auto updates = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
int* a = new int[sz0];
for( int i=0; i<sz0; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + sz0-1));
for( int i=0; i<sz0; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setInputArray(2, updates, true);
ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
ctx->markInplace(true);
return ctx;
};
output += helper.runOperationSuit(&sa3d, generator3, batch3, "Scatter Update - 3d");
return output;
}
static std::string gatherOpBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^22 in steps of 4
ParametersBatch batch({&length});
//Gather 1D tests - 1d input, 1d indices -> 1d output
sd::ops::gather gather1;
DeclarableBenchmark gather1d(gather1, "gather1d");
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int length = p.getIntParam("length");
auto in = NDArrayFactory::create_<float>('c', {length});
auto indices = NDArrayFactory::create_<int>('c', {length});
int* a = new int[length];
for( int i=0; i<length; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + length-1));
for( int i=0; i<length; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {length}), true);
return ctx;
};
output += helper.runOperationSuit(&gather1d, generator, batch, "Gather - 1d");
//Gather 2D tests - 2d input, 1d indices -> 2d output
IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^20 in steps of 2: 2^10, ..., 2^20
PredefinedParameters cols("cols", {32});
ParametersBatch batch2({&rows, &cols});
sd::ops::gather gather2;
DeclarableBenchmark gather2d(gather2, "gather2d");
auto generator2 = PARAMETRIC_D() {
auto ctx = new Context(1);
int rows = p.getIntParam("rows");
int cols = p.getIntParam("cols");
auto in = NDArrayFactory::create_<float>('c', {rows, cols});
auto indices = NDArrayFactory::create_<int>('c', {rows});
int* a = new int[rows];
for( int i=0; i<rows; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + rows-1));
for( int i=0; i<rows; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {rows, cols}), true);
return ctx;
};
output += helper.runOperationSuit(&gather2d, generator2, batch2, "Gather - 2d");
//Gather 3D tests - 3d input, 1d indices -> 3d output
IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); //2^8 to 2^16 in steps of 4
PredefinedParameters sz1("sz1", {32});
ParametersBatch batch3({&sz0, &sz1});
sd::ops::gather gather3;
DeclarableBenchmark gather3d(gather3, "gather3d");
auto generator3 = PARAMETRIC_D() {
auto ctx = new Context(1);
int sz0 = p.getIntParam("sz0");
int sz1 = p.getIntParam("sz1");
auto in = NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1});
auto indices = NDArrayFactory::create_<int>('c', {sz0});
int* a = new int[sz0];
for( int i=0; i<sz0; i++ ){
a[i] = i;
}
srand(12345);
std::random_shuffle(a, (a + sz0-1));
for( int i=0; i<sz0; i++ ){
indices->p(i, a[i]);
}
delete[] a;
ctx->setInputArray(0, in, true);
ctx->setInputArray(1, indices, true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {sz0, sz1, 512/sz1}), true);
return ctx;
};
output += helper.runOperationSuit(&gather3d, generator3, batch3, "Gather - 3d");
return output;
}
static std::string mismatchedOrdersAssignBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters rows("rows", 2, 2, mismatchedAssignPowLimit, 4); //2^2 to 2^26 in steps of 2 - 2^1=2, ..., 2^26=67108864
BoolParameters cf("cf");
ParametersBatch batch({&rows, &cf});
auto generator = PARAMETRIC_XZ() {
int numElements = 67108864; //2^26
int rows = p.getIntParam("rows");
int cols = numElements / rows;
bool c = p.getIntParam("cf");
auto arr = NDArrayFactory::create_<float>(c ? 'c' : 'f', {rows, cols});
auto arr2 = NDArrayFactory::create_<float>(c ? 'f' : 'c', {rows, cols});
x.push_back(arr);
z.push_back(arr2);
};
TransformBenchmark tb(transform::AnyOps::Assign, "assign");
output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign");
//Also test: NCHW to NHWC and back
BoolParameters nchw("nchw");
ParametersBatch batch2({&nchw});
auto generator2 = PARAMETRIC_XZ() {
bool nchw = p.getIntParam("nchw");
if(nchw) {
auto orig = NDArrayFactory::create_<float>('c', {16, 32, 64, 64});
orig->permutei({0,2,3,1});
x.push_back(orig);
z.push_back(NDArrayFactory::create_<float>('c', {16, 64, 64, 32}));
} else {
auto orig = NDArrayFactory::create_<float>('c', {16, 64, 64, 32});
orig->permutei({0,3,1,2});
x.push_back(orig);
z.push_back(NDArrayFactory::create_<float>('c', {16, 32, 64, 64}));
}
};
TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw");
output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign");
return output;
}
static std::string broadcastOpsMatrixBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Broadcast ops: matrices for rank 3, 4, 5
for( int rank=3; rank <= broadcastMatrixRankLimit; rank++ ){
int numAxisTests = -1;
if(rank == 3){
numAxisTests = 3;
} else if(rank == 4){
numAxisTests = 6;
} else if(rank == 5){
numAxisTests = 10;
}
IntParameters testNum("testNum", 0,numAxisTests-1,1);
ParametersBatch b({&testNum});
auto generator = PARAMETRIC_D(){
int n = p.getIntParam("testNum");
std::vector<int> axis({});
switch(n){
//rank 3+
case 0:
axis = std::vector<int>({0,1});
break;
case 1:
axis = std::vector<int>({0,2});
break;
case 2:
axis = std::vector<int>({1,2});
break;
//rank 4+
case 3:
axis = std::vector<int>({0,3});
break;
case 4:
axis = std::vector<int>({1,3});
break;
case 5:
axis = std::vector<int>({2,3});
break;
//Rank 5
case 6:
axis = std::vector<int>({0,4});
break;
case 7:
axis = std::vector<int>({1,4});
break;
case 8:
axis = std::vector<int>({2,4});
break;
case 9:
axis = std::vector<int>({3,4});
break;
}
std::vector<Nd4jLong> shape({});
std::vector<Nd4jLong> toBcShape({});
int vectorLength;
if(rank == 3){
shape = std::vector<Nd4jLong>({64,64,64});
toBcShape = std::vector<Nd4jLong>({64,64,64});
vectorLength = 64;
} else if(rank == 4){
shape = std::vector<Nd4jLong>({32,32,32,32});
toBcShape = std::vector<Nd4jLong>({32,32,32,32});
vectorLength = 32;
} else if(rank == 5){
shape = std::vector<Nd4jLong>({16,16,16,16,16});
toBcShape = std::vector<Nd4jLong>({16,16,16,16,16});
vectorLength = 16;
}
for( int i=0; i<rank; i++ ){
if(axis[0] == i || axis[1] == i){
continue;
}
toBcShape[i] = 1;
}
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', shape), true);
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', toBcShape), true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', shape), true);
return ctx;
};
std::string name;
name += "Broadcast Matrix Add (Custom) - Rank";
name += std::to_string(rank);
sd::ops::add op;
DeclarableBenchmark benchmark(op, "add");
output += helper.runOperationSuit(&benchmark, generator, b, name.c_str());
}
return output;
}
static std::string broadcast2dBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
PredefinedParameters rows("rows", {65536});
IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2, 2^6, 2^10
BoolParameters axis("axis");
BoolParameters inplace("inplace");
ParametersBatch batch({&rows, &cols, &axis, &inplace});
auto generator = PARAMETRIC_D() {
auto a = p.getIntParam("axis");
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
auto ctx = new Context(1);
ctx->setInputArray(0, arr, true);
if(a == 0){
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), 1}), true);
} else {
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
}
if (p.getIntParam("inplace") == 1) {
ctx->setOutputArray(0, arr);
ctx->markInplace(true);
} else {
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}), true);
}
return ctx;
};
std::string s("add");
sd::ops::add op;
DeclarableBenchmark benchmark(op, "add");
output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
return output;
}
static std::string broadcastBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
//Broadcast ops: vectors for rank 2, 3, 4, 5
for( int axis=0; axis<=1; axis++ ){
PredefinedParameters rows("rows", {65536});
IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^1 to 2^10 in steps of 2 - 2^1=2, ..., 2^10=1024
BoolParameters inplace("inplace");
ParametersBatch batch({&rows, &cols, &inplace});
auto generator = PARAMETRIC_XYZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
x.push_back(arr);
if(axis == 0){
y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows")}));
} else {
y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("cols")}));
}
if (p.getIntParam("inplace") == 1) {
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
}
};
std::string s("bAdd"); s += std::to_string(axis); s += "r2";
BroadcastBenchmark bAdd(broadcast::Add, s, {axis});
output += helper.runOperationSuit(&bAdd, generator, batch, "Broadcast Add - Rank 2");
}
for( int rank=3; rank<=5; rank++ ){
for( int axis=1; axis<rank; axis++ ){
std::vector<Nd4jLong> shape({});
int vectorLength;
if(rank == 3){
shape = std::vector<Nd4jLong>({32,128,128});
vectorLength = 128;
} else if(rank == 4){
shape = std::vector<Nd4jLong>({16,64,64,64});
vectorLength = 64;
} else if(rank == 5){
shape = std::vector<Nd4jLong>({16,48,48,48,48});
vectorLength = 48;
}
ParametersBatch batch({});
//Note: always inplace here
auto generator = PARAMETRIC_XYZ() {
auto arr = NDArrayFactory::create_<float>('c', shape);
x.push_back(arr);
y.push_back(NDArrayFactory::create_<float>('c', {vectorLength}));
z.push_back(arr);
};
std::string name("bArr-r"); name += std::to_string(rank); name += "a"; name += std::to_string(axis);
BroadcastBenchmark bAdd(broadcast::Add, name, {axis});
std::string n2("Broadcast Add - Rank"); n2 += std::to_string(rank); n2 += " - axis="; n2 += std::to_string(axis);
output += helper.runOperationSuit(&bAdd, generator, batch, n2.c_str());
}
}
return output;
}
static std::string fastStridedReductionNonEws() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters stride("stride", 2, 0, 10, 2); //2^0=1, ..., 2^10=1024
ParametersBatch batch({&stride});
//This is an edge case: technically an EWS *should* be available here
auto generator1 = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {131072 + (stride == 1 ? 0 : 1), stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
IndicesList indices({NDIndex::interval(0,131072), NDIndex::interval(0,1)});
strided = new NDArray(arr->subarray(indices)); //All rows, first column
delete arr;
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator1), batch, "Strided Sum - No EWS Test 1");
//No EWS defined for this case
auto generator2 = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
IndicesList indices({NDIndex::interval(0,2*1024,2), NDIndex::all(), NDIndex::interval(0,1)});
strided = new NDArray(arr->subarray(indices));
delete arr;
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS");
output += helper.runOperationSuit(&rbSum2, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator2), batch, "Strided Sum - No EWS Test 2");
return output;
}
static std::string fastStridedReductionIrregular() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4
PredefinedParameters stride("stride", {26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028});
ParametersBatch batch({&length, &stride});
auto generator = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
IndicesList indices({NDIndex::all(), NDIndex::interval(0,1)});
strided = new NDArray(arr->subarray(indices)); //All rows, first column
delete arr;
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum");
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Strided Sum - Irregular Strides");
return output;
}
static std::string fastStridedReductionsRegular() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4
IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024
ParametersBatch batch({&length, &stride});
auto generator = PARAMETRIC_XYZ() {
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
IndicesList indices({NDIndex::all(), NDIndex::point(0)});
strided = new NDArray(arr->subarray(indices)); //All rows, first column
delete arr;
}
strided->assign(1.0);
x.push_back(strided);
y.push_back(nullptr);
// z.push_back(NDArrayFactory::create_<float>(0.0f));
z.push_back(NDArrayFactory::create_<float>('c', {1}));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "Strided Sum");
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Strided Sum - Regular Strides (powers of 2)");
auto generator3 = PARAMETRIC_D(){
auto ctx = new Context(1);
auto stride = p.getIntParam("stride");
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length"), stride});
NDArray* strided;
if(stride == 1){
strided = arr;
} else {
IndicesList indices({NDIndex::all(), NDIndex::point(0)});
strided = new NDArray(arr->subarray(indices)); //All rows, first column
delete arr;
}
strided->assign(1.0);
ctx->setInputArray(0, strided, true);
ctx->setOutputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {1}), true);
auto iargs = new Nd4jLong[1];
iargs[0] = 0;
ctx->setIArguments(iargs, 1);
delete[] iargs;
return ctx;
};
sd::ops::argmax opArgmax;
DeclarableBenchmark dbArgmax(opArgmax, "stridedArgmax");
output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Strided Argmax");
return output;
}
static std::string fastReduceAlongDimBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
int length[] = {1024*1024, 64*1024*1024};
int powLimit[] = {10, 20, 26};
int powStep[] = {2, 2, 4};
for( int i=0; i < limit3; i++ ){
IntPowerParameters rows("rows", 2, 0, powLimit[i], powStep[i]);
BoolParameters dim("dim");
ParametersBatch batch({&rows, &dim});
auto generator = PARAMETRIC_XYZ() {
int rows = p.getIntParam("rows");
int cols = length[i] / rows;
int dim = p.getIntParam("dim");
auto arr = NDArrayFactory::create_<float>('c', {rows, cols});
x.push_back(arr);
y.push_back(NDArrayFactory::create_<Nd4jLong>(dim));
NDArray* result;
if(dim == 0){
result = NDArrayFactory::create_<float>('c', {cols});
} else {
result = NDArrayFactory::create_<float>('c', {rows});
}
z.push_back(result);
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
ReductionBenchmark rbMax(reduce::SameOps::Max, "max");
std::string s1("Sum Along Dimension - ");
s1 += std::to_string(length[i]);
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s1.c_str());
auto generator3 = PARAMETRIC_D(){
auto ctx = new Context(1);
int rows = p.getIntParam("rows");
int cols = length[i] / rows;
int dim = p.getIntParam("dim");
auto arr = NDArrayFactory::create_<float>('c', {rows, cols});
Nd4jLong* dimArg = new Nd4jLong[1];
dimArg[0] = dim;
ctx->setIArguments(dimArg, 1);
delete[] dimArg;
ctx->setInputArray(0, arr, true);
NDArray* result;
if(dim == 0){
result = NDArrayFactory::create_<Nd4jLong>('c', {cols});
} else {
result = NDArrayFactory::create_<Nd4jLong>('c', {rows});
}
ctx->setOutputArray(0, result, true);
return ctx;
};
std::string s5("Argmax Along Dimension - ");
s5 += std::to_string(length[i]);
sd::ops::argmax opArgmax;
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str());
}
return output;
}
static std::string fastReduceToScalarBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, reduceScalarPowLimit, 4); //2^10 to 2^26 in steps of 4
ParametersBatch batch({&length});
auto generator = PARAMETRIC_XYZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
x.push_back(arr);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<float>(0.0f));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Sum - Full Array Reduction");
//Index reduction
sd::ops::argmax opArgmax;
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
auto generator3 = PARAMETRIC_D(){
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
ctx->setInputArray(1, NDArrayFactory::create_<Nd4jLong>((Nd4jLong)0), true);
ctx->setOutputArray(0, NDArrayFactory::create_<Nd4jLong>(0), true);
return ctx;
};
output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction");
return output;
}
static std::string fastNonEwsTransformBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters rowcol("rowcol", 2, 2, nonEwsPowLimit, 4); //2^2 to 2^14 in steps of 4 -> non-inplace case: 2x 2^10 x 2^10 = 128mb
BoolParameters inplace("inplace");
ParametersBatch batch({&rowcol, &inplace});
auto generator = PARAMETRIC_XZ() {
int r = p.getIntParam("rowcol");
auto arr = NDArrayFactory::create_<float>('c', {r, r+1});
IndicesList indices({NDIndex::all(), NDIndex::interval(0,r-1)});
auto view = new NDArray(arr->subarray(indices));
//nd4j_printf("VIEW ARRAY: rows=%lld, columns=%lld", view->sizeAt(0), view->sizeAt(1));
x.push_back(view);
if(p.getIntParam("inplace") == 1){
z.push_back(view);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {view->sizeAt(0),view->sizeAt(1)}));
}
delete arr;
};
ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU_View");
sbLRelu.setY(NDArrayFactory::create_<float>(0.0));
TransformBenchmark tbExp(transform::StrictOps::Exp, "exp view");
output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU View");
output += helper.runOperationSuit(&tbExp, generator, batch, "Exp View");
return output;
}
static std::string fastPairwiseBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, pairwisePowLimit, 4); //2^10 to 2^26 in steps of 4 -> max is 512mb
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XYZ() {
auto arr1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
auto arr2 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
x.push_back(arr1);
y.push_back(arr2);
if(p.getIntParam("inplace") == 1){
z.push_back(arr1);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
PairwiseBenchmark pb1(pairwise::Ops::Add, "Add");
output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add");
PairwiseBenchmark pb2(pairwise::Ops::Add, "Multiply");
output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Multiply");
return output;
}
static std::string heavyTransformsBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22, steps of 4
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if (p.getIntParam("inplace") == 1) {
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
//Ops to test: erf (transform), betainc (custom), polygamma, synthetic ops?
TransformBenchmark erf(transform::StrictOps::Erf, "Erf");
output += helper.runOperationSuit(&erf, generator, batch, "Error Function (Erf)");
ParametersBatch batch2({&length});
sd::ops::polygamma op1;
DeclarableBenchmark pg(op1, "polygamma");
auto generator2 = PARAMETRIC_D() {
auto ctx = new Context(1);
auto in0 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
in0->assign(0.25);
auto in1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
in1->assign(0.5);
ctx->setInputArray(0, in0, true);
ctx->setInputArray(1, in1, true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
return ctx;
};
IntPowerParameters lengthBetaInc("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22 in steps of 4
ParametersBatch batch3({&lengthBetaInc});
sd::ops::betainc op2;
DeclarableBenchmark binc(op2, "betainc");
auto generator3 = PARAMETRIC_D() {
auto ctx = new Context(1);
auto in0 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
in0->assign(0.25);
auto in1 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
in1->assign(0.5);
auto in2 = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
in2->assign(0.75);
ctx->setInputArray(0, in0, true);
ctx->setInputArray(1, in1, true);
ctx->setInputArray(2, in2, true);
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("length")}), true);
return ctx;
};
output += helper.runOperationSuit(&pg, generator2, batch2, "PolyGamma Function");
output += helper.runOperationSuit(&binc, generator3, batch3, "Incomplete Beta Function (BetaInc)");
return output;
}
static std::string intermediateTransformsBenchmark() {
std::string output;
//Non-inplace: 2x 2^26 elements FP32 -> 512MB
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, intermediateTransformPowLimit, 4); //2^20 to 2^22 in steps of 4
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
TransformBenchmark tbTanh(transform::StrictOps::Tanh, "tanh");
TransformBenchmark tbGelu(transform::StrictOps::GELU, "gelu");
output += helper.runOperationSuit(&tbTanh, generator, batch, "Tanh");
output += helper.runOperationSuit(&tbGelu, generator, batch, "gelu");
//2x 1024 cols x 2^18 = 2GB
IntPowerParameters rows("rows", 2, 10, intermediateTransformPowLimit2, 4);
PredefinedParameters cols("cols", {4, 128, 1024});
ParametersBatch batch2({&rows, &cols, &inplace});
auto generator2 = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
}
};
//TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");
//output += helper.runOperationSuit(&tbSoftmax, generator2, batch2, "Softmax");
return output;
}
static std::string fastTransformsBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, transformBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU");
sbLRelu.setY(NDArrayFactory::create_<float>(0.0));
TransformBenchmark tbAbs(transform::SameOps::Abs, "abs");
TransformBenchmark tbExp(transform::StrictOps::Exp, "exp");
output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU");
output += helper.runOperationSuit(&tbAbs, generator, batch, "Abs");
output += helper.runOperationSuit(&tbExp, generator, batch, "Exp");
return output;
}
static std::string fastScalarBenchmark() {
std::string output;
BenchmarkHelper helper(wIterations, rIterations);
IntPowerParameters length("length", 2, 10, scalarBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
}
};
ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");
sbAdd.setY(NDArrayFactory::create_<float>(3.14159265359));
sbDiv.setY(NDArrayFactory::create_<float>(3.14159265359));
sbPow.setY(NDArrayFactory::create_<float>(3.14159265359));
output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359) - F32");
output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359) - F32");
output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359) - F32");
return output;
}
static long nowMs(){
auto s = std::chrono::system_clock::now().time_since_epoch();
auto v = std::chrono::duration_cast<std::chrono::milliseconds>(s).count();
return v;
}
static long duration(long start){
return nowMs() - start;
}
static long done(long start){
long dur = duration(start);
nd4j_printf("Done: %i ms\n", dur);
return nowMs();
}
std::string FullBenchmarkSuit::runSuit() {
std::string result;
long start = nowMs();
// set 1
nd4j_printf("Running FullBenchmarkSuite.fastScalarBenchmark\n", "");
result += fastScalarBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastTransformsBenchmark\n", "");
result += fastTransformsBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.intermediateTransformsBenchmark\n", "");
result += intermediateTransformsBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastPairwiseBenchmark\n", "");
result += fastPairwiseBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.heavyTransformsBenchmark\n", "");
result += heavyTransformsBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastNonEwsTransformBenchmark\n", "");
result += fastNonEwsTransformBenchmark();
start = done(start);
// set 2
nd4j_printf("Running FullBenchmarkSuite.fastReduceToScalarBenchmark\n", "");
result += fastReduceToScalarBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastReduceAlongDimBenchmark\n", "");
result += fastReduceAlongDimBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionsRegular\n", "");
result += fastStridedReductionsRegular();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionIrregular\n", "");
result += fastStridedReductionIrregular();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionNonEws\n", "");
result += fastStridedReductionNonEws();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.broadcastBenchmark\n", "");
result += broadcastBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.broadcast2dBenchmark\n", "");
result += broadcast2dBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.broadcastOpsMatrixBenchmark\n", "");
result += broadcastOpsMatrixBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.mismatchedOrdersAssignBenchmark\n", "");
result += mismatchedOrdersAssignBenchmark();
start = done(start);
// set 3
nd4j_printf("Running FullBenchmarkSuite.gatherOpBenchmark\n", "");
result += gatherOpBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.scatterOpBenchmark\n", "");
result += scatterOpBenchmark();
start = done(start);
// set 4
nd4j_printf("Running FullBenchmarkSuite.gemmRegularBenchmark\n", "");
result += gemmRegularBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.gemmIrregularBenchmark\n", "");
result += gemmIrregularBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.rngBenchmark\n", "");
result += rngBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.conv2dBenchmark\n", "");
result += conv2dBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.pool2dBenchmark\n", "");
result += pool2dBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.batchnormBenchmark\n", "");
result += batchnormBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.lstmBenchmark\n", "");
result += lstmBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.conv3dBenchmark\n", "");
result += conv3dBenchmark();
start = done(start);
nd4j_printf("Running FullBenchmarkSuite.maxPool3DBenchmark\n", "");
result += maxPool3DBenchmark();
start = done(start);
// nd4j_printf("Running FullBenchmarkSuite.layerNormBenchmark\n", "");
// result += layerNormBenchmark();
// start = done(start);
return result;
}
}