2019-06-06 14:21:15 +02:00
/*******************************************************************************
* Copyright ( c ) 2015 - 2018 Skymind , Inc .
*
* This program and the accompanying materials are made available under the
* terms of the Apache License , Version 2.0 which is available at
* https : //www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS , WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied . See the
* License for the specific language governing permissions and limitations
* under the License .
*
* SPDX - License - Identifier : Apache - 2.0
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
//
// Created by raver119 on 20.11.17.
//
# include "testlayers.h"
# include <Graph.h>
# include <chrono>
# include <Node.h>
# include <ops/declarable/CustomOperations.h>
# include <graph/profiling/GraphProfilingHelper.h>
# include <type_conversions.h>
# include <helpers/threshold.h>
# include <helpers/MmulHelper.h>
# include <ops/ops.h>
# include <OmpLaunchHelper.h>
# include <GradCheck.h>
# include <ops/declarable/helpers/im2col.h>
# include <Loops.h>
# include <helpers/BenchmarkHelper.h>
# include <ops/declarable/helpers/scatter.h>
# include <helpers/ConstantShapeHelper.h>
# include <helpers/ConstantTadHelper.h>
# include <array>
using namespace nd4j ;
using namespace nd4j : : graph ;
class PlaygroundTests : public testing : : Test {
public :
int numIterations = 3 ;
int poolSize = 10 ;
PlaygroundTests ( ) {
printf ( " \n " ) ;
fflush ( stdout ) ;
}
} ;
/*
TEST_F ( PlaygroundTests , LSTMBenchmarks_DebugTNS ) {
BenchmarkHelper helper ( 5 , 10 ) ;
PredefinedParameters mb ( " mb " , { 1 , 8 , 64 } ) ;
PredefinedParameters nInOut ( " nInOut " , { 32 , 256 , 1024 } ) ;
ParametersBatch batch ( { & mb , & nInOut } ) ;
nd4j : : ops : : lstmBlock lstmBlock ;
DeclarableBenchmark benchmark ( lstmBlock , " lstm " ) ;
int seqLength = 64 ;
auto generator = PARAMETRIC_D ( ) {
auto ctx = new Context ( 1 ) ;
int m = p . getIntParam ( " mb " ) ;
int n = p . getIntParam ( " nInOut " ) ;
Nd4jLong l = 0 ;
ctx - > setInputArray ( 0 , NDArrayFactory : : create_ < Nd4jLong > ( l ) ) ; //Max TS length (unused)
//TNS format
ctx - > setInputArray ( 1 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //x
ctx - > setOutputArray ( 0 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //i
ctx - > setOutputArray ( 1 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //c
ctx - > setOutputArray ( 2 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //f
ctx - > setOutputArray ( 3 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //o
ctx - > setOutputArray ( 4 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //z
ctx - > setOutputArray ( 5 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //h
ctx - > setOutputArray ( 6 , NDArrayFactory : : create_ < float > ( ' c ' , { seqLength , m , n } ) ) ; //y
auto cLast = NDArrayFactory : : create_ < float > ( ' c ' , { m , n } ) ;
auto yLast = NDArrayFactory : : create_ < float > ( ' c ' , { m , n } ) ;
auto W = NDArrayFactory : : create_ < float > ( ' c ' , { 2 * n , 4 * n } ) ;
auto Wci = NDArrayFactory : : create_ < float > ( ' c ' , { n } ) ;
auto Wcf = NDArrayFactory : : create_ < float > ( ' c ' , { n } ) ;
auto Wco = NDArrayFactory : : create_ < float > ( ' c ' , { n } ) ;
auto b = NDArrayFactory : : create_ < float > ( ' c ' , { 4 * n } ) ;
ctx - > setInputArray ( 2 , cLast ) ;
ctx - > setInputArray ( 3 , yLast ) ;
ctx - > setInputArray ( 4 , W ) ;
ctx - > setInputArray ( 5 , Wci ) ;
ctx - > setInputArray ( 6 , Wcf ) ;
ctx - > setInputArray ( 7 , Wco ) ;
ctx - > setInputArray ( 8 , b ) ;
Nd4jLong * iargs = new Nd4jLong [ 2 ] ;
iargs [ 0 ] = 0 ; //No peephole
iargs [ 1 ] = 0 ; //TNS
ctx - > setIArguments ( iargs , 2 ) ;
delete [ ] iargs ;
double * targs = new double [ 2 ] ;
targs [ 0 ] = 1.0 ; //forget bias
targs [ 1 ] = 0.0 ; //cell clipping value
ctx - > setTArguments ( targs , 2 ) ;
delete [ ] targs ;
return ctx ;
} ;
helper . runOperationSuit ( & benchmark , generator , batch , " LSTMBlock " ) ;
}
TEST_F ( PlaygroundTests , BroadcastOps2d ) {
BenchmarkHelper helper ;
PredefinedParameters rows ( " rows " , { 1024 , 1048576 } ) ;
IntPowerParameters cols ( " cols " , 2 , 2 , 10 , 2 ) ; //2^1 to 2^10 in steps of 2 - 2^1=2, ..., 2^10=1024
BoolParameters axis ( " axis " ) ;
BoolParameters inplace ( " inplace " ) ;
ParametersBatch batch ( { & rows , & cols , & axis , & inplace } ) ;
auto generator = PARAMETRIC_D ( ) {
nd4j_printf ( " Entered generator \n " , " " ) ;
auto a = p . getIntParam ( " axis " ) ;
auto arr = NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " rows " ) , p . getIntParam ( " cols " ) } ) ;
nd4j_printf ( " Created first array: [%lld, %lld] \n " , arr - > sizeAt ( 0 ) , arr - > sizeAt ( 1 ) ) ;
auto ctx = new Context ( 1 ) ;
ctx - > setInputArray ( 0 , arr , true ) ;
if ( a = = 0 ) {
ctx - > setInputArray ( 1 , NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " rows " ) , 1 } ) , true ) ;
nd4j_printf ( " Created second array (a=0): [%lld, %lld] \n " , ctx - > getNDArray ( 1 ) - > sizeAt ( 0 ) , ctx - > getNDArray ( 1 ) - > sizeAt ( 1 ) ) ;
} else {
ctx - > setInputArray ( 1 , NDArrayFactory : : create_ < float > ( ' c ' , { 1 , p . getIntParam ( " cols " ) } ) , true ) ;
nd4j_printf ( " Created second array (a=1): [%lld, %lld] \n " , ctx - > getNDArray ( 1 ) - > sizeAt ( 0 ) , ctx - > getNDArray ( 1 ) - > sizeAt ( 1 ) ) ;
}
if ( p . getIntParam ( " inplace " ) = = 1 ) {
ctx - > setOutputArray ( 0 , arr , false ) ;
ctx - > markInplace ( true ) ;
nd4j_printf ( " Set result array (inplace) \n " , " " ) ;
} else {
auto out = NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " rows " ) , p . getIntParam ( " cols " ) } ) ;
ctx - > setOutputArray ( 0 , out , true ) ;
nd4j_printf ( " Created and set result array (not inplace): [%lld, %lld] \n " , out - > sizeAt ( 0 ) , out - > sizeAt ( 1 ) ) ;
}
return ctx ;
} ;
std : : string s ( " add " ) ;
nd4j : : ops : : add op ;
DeclarableBenchmark benchmark ( op , " add " ) ;
nd4j_printf ( " About to execute \n " , " " ) ;
helper . runOperationSuit ( & benchmark , generator , batch , " Broadcast (Custom) Add - 2d " ) ;
}
*/
TEST_F ( PlaygroundTests , test_small_reductions ) {
auto f = NDArrayFactory : : create < float > ( ' c ' , { 1024 , 1024 } ) ;
f . assign ( 1.0f ) ;
int iterations = 1 ;
std : : vector < Nd4jLong > results ( iterations ) ;
Nd4jLong mean = 0L ;
Nd4jLong max = 0L ;
Nd4jLong min = DataTypeUtils : : max < Nd4jLong > ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto x = NDArrayFactory : : create < float > ( ' c ' , { 4 , 64 } ) ;
auto z = NDArrayFactory : : create < float > ( ' c ' , { 64 } ) ;
x . assign ( 1.0f ) ;
int axis = 0 ;
auto tadPack = nd4j : : ConstantTadHelper : : getInstance ( ) - > tadForDimensions ( x . shapeInfo ( ) , axis ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
NativeOpExecutioner : : execReduceFloat ( nd4j : : LaunchContext : : defaultContext ( ) , reduce : : Mean , x . buffer ( ) , x . shapeInfo ( ) , x . specialBuffer ( ) , x . specialShapeInfo ( ) , nullptr , z . buffer ( ) , z . shapeInfo ( ) , z . specialBuffer ( ) , z . specialShapeInfo ( ) , & axis , 1 , tadPack . primaryShapeInfo ( ) , tadPack . primaryOffsets ( ) ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
results [ e ] = duration ;
mean + = duration ;
if ( duration > max )
max = duration ;
if ( duration < min )
min = duration ;
}
mean / = iterations ;
std : : sort ( results . begin ( ) , results . end ( ) ) ;
nd4j_printf ( " Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld] \n " , results [ results . size ( ) / 2 ] , mean , min , max ) ;
}
TEST_F ( PlaygroundTests , Test_PermutedArray_Operation_1 ) {
auto x = NDArrayFactory : : create < float > ( ' c ' , { 64 , 32 , 4 , 32 } ) ;
auto z = NDArrayFactory : : create < float > ( ' c ' , { 4 , 64 , 32 , 32 } ) ;
x . assign ( 1.0f ) ;
x . permutei ( { 2 , 0 , 3 , 1 } ) ;
//x.printShapeInfo("x");
int iterations = 1 ;
std : : vector < Nd4jLong > results ( iterations ) ;
Nd4jLong mean = 0L ;
Nd4jLong max = 0L ;
Nd4jLong min = DataTypeUtils : : max < Nd4jLong > ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
NativeOpExecutioner : : execTransformStrict ( LaunchContext : : defaultContext ( ) , transform : : StrictOps : : Sin , x . buffer ( ) , x . shapeInfo ( ) , x . specialBuffer ( ) , x . specialShapeInfo ( ) , z . buffer ( ) , z . shapeInfo ( ) , z . specialBuffer ( ) , z . specialShapeInfo ( ) , nullptr , nullptr , nullptr ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
results [ e ] = duration ;
mean + = duration ;
if ( duration > max )
max = duration ;
if ( duration < min )
min = duration ;
}
mean / = iterations ;
std : : sort ( results . begin ( ) , results . end ( ) ) ;
nd4j_printf ( " Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld] \n " , results [ results . size ( ) / 2 ] , mean , min , max ) ;
}
TEST_F ( PlaygroundTests , Test_PermutedArray_Operation_2 ) {
//x.printShapeInfo("x");
int iterations = 100 ;
std : : vector < Nd4jLong > results ( iterations ) ;
Nd4jLong mean = 0L ;
Nd4jLong max = 0L ;
Nd4jLong min = DataTypeUtils : : max < Nd4jLong > ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
Nd4jLong eShapeInfo [ ] = { 2 , 8 , 256 , 256 , 1 , 8192 , 1 , 99 } ;
Nd4jLong xShapeInfo [ ] = { 2 , 8 , 256 , 1024 , 1 , 8192 , 0 , 99 } ;
Nd4jLong yShapeInfo [ ] = { 2 , 8 , 256 , 256 , 1 , 8192 , 1 , 99 } ;
float xBuff [ 8 * 1024 ] ;
NDArray x ( xBuff , xShapeInfo ) ;
//NDArray x(eShapeInfo, nd4j::DataType::FLOAT32, true);
NDArray z ( yShapeInfo , nd4j : : DataType : : FLOAT32 , true ) ;
x . linspace ( 0.1f , 0.01f ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
NativeOpExecutioner : : execTransformStrict ( LaunchContext : : defaultContext ( ) , transform : : StrictOps : : Tanh , x . buffer ( ) , x . shapeInfo ( ) , x . specialBuffer ( ) , x . specialShapeInfo ( ) , z . buffer ( ) , z . shapeInfo ( ) , z . specialBuffer ( ) , z . specialShapeInfo ( ) , nullptr , nullptr , nullptr ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
results [ e ] = duration ;
mean + = duration ;
if ( duration > max )
max = duration ;
if ( duration < min )
min = duration ;
}
mean / = iterations ;
std : : sort ( results . begin ( ) , results . end ( ) ) ;
nd4j_printf ( " Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld] \n " , results [ results . size ( ) / 2 ] , mean , min , max ) ;
}
TEST_F ( PlaygroundTests , test_reduce_3 ) {
// auto x = NDArrayFactory::create<float>('c', {4096, 8192});
// auto y = NDArrayFactory::create<float>('c', {8192});
// auto z = NDArrayFactory::create<float>('c', {4096});
auto x = NDArrayFactory : : create < float > ( ' c ' , { 16 , 32 } ) ;
auto y = NDArrayFactory : : create < float > ( ' c ' , { 32 } ) ;
auto z = NDArrayFactory : : create < float > ( ' c ' , { 16 } ) ;
auto dim = NDArrayFactory : : create < int > ( ' c ' , { 1 } , { 1 } ) ;
auto iterations = 100 ;
std : : vector < Nd4jLong > results ( iterations ) ;
Nd4jLong mean = 0L ;
Nd4jLong max = 0L ;
Nd4jLong min = DataTypeUtils : : max < Nd4jLong > ( ) ;
NativeOps nativeOps ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
nativeOps . execReduce3 ( nullptr , reduce3 : : CosineDistance , x . buffer ( ) , x . shapeInfo ( ) , x . specialBuffer ( ) ,
x . specialShapeInfo ( ) , nullptr , y . buffer ( ) , y . shapeInfo ( ) , y . specialBuffer ( ) ,
y . specialShapeInfo ( ) , z . buffer ( ) , z . shapeInfo ( ) , z . specialBuffer ( ) , z . specialShapeInfo ( ) ,
dim . buffer ( ) , dim . shapeInfo ( ) , dim . specialBuffer ( ) , dim . specialShapeInfo ( ) , nullptr ,
nullptr , nullptr , nullptr ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
results [ e ] = duration ;
mean + = duration ;
if ( duration > max )
max = duration ;
if ( duration < min )
min = duration ;
}
mean / = iterations ;
std : : sort ( results . begin ( ) , results . end ( ) ) ;
nd4j_printf ( " Median time: [%lld]; Mean time: [%lld]; Min time: [%lld]; Max time: [%lld] \n " , results [ results . size ( ) / 2 ] , mean , min , max ) ;
}
/*
TEST_F ( PlaygroundTests , Test_OpBenchmark_1 ) {
BenchmarkHelper helper ;
ScalarBenchmark sb1 ( scalar : : Add , " add " , NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 100 } ) , NDArrayFactory : : create_ < float > ( 1.0f ) , NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 100 } ) ) ;
ScalarBenchmark sb2 ( scalar : : Add , " add " , NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) , NDArrayFactory : : create_ < float > ( 1.0f ) , NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) ) ;
helper . runOperationSuit ( { & sb1 , & sb2 } , " ScalarAdd " ) ;
}
TEST_F ( PlaygroundTests , Test_OpBenchmark_2 ) {
BenchmarkHelper helper ;
Parameters parameters ;
parameters . addBoolParam ( " fOrder " , true ) ;
float scalar = 2.0f ;
auto fa = NDArrayFactory : : create < int > ( 1 ) ;
ScalarBenchmark sb ( scalar : : Multiply ) ;
// Y will be shared
sb . setY ( NDArrayFactory : : create_ < float > ( scalar ) ) ;
auto generator = GENERATE_XZ ( ) {
// operands go together line by line
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 100 } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 100 } ) ) ;
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) ) ;
// only share within single op call. do not cross share
auto shared = NDArrayFactory : : create_ < float > ( ' c ' , { 256 , 768 } ) ;
x . push_back ( shared ) ;
z . push_back ( shared ) ;
// using bool param here
if ( parameters . getBoolParam ( " fOrder " ) ) {
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 1000 , 1000 } ) ) ;
}
//another way to call inplace op
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 100 } ) ) ;
z . push_back ( nullptr ) ;
} ;
helper . runOperationSuit ( & sb , generator , " ScalarTest " ) ;
TransformBenchmark tb ( transform : : StrictOps : : Tanh , " tanh " ) ;
// we can use the same generator, since the same number of operands used
helper . runOperationSuit ( & tb , generator , " TransformTest " ) ;
PairwiseBenchmark pb ( pairwise : : Pow , " pow test " ) ;
auto generatorXYZ = GENERATE_XYZ ( ) {
x . push_back ( NDArrayFactory : : create_ < float > ( ' f ' , { 100 , 1000 } ) ) ;
y . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 1000 } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 1000 } ) ) ;
x . push_back ( NDArrayFactory : : create_ < float > ( ' f ' , { 100 , 1000 } ) ) ;
y . push_back ( NDArrayFactory : : create_ < float > ( ' f ' , { 100 , 1000 } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' f ' , { 100 , 1000 } ) ) ;
} ;
helper . runOperationSuit ( & pb , generatorXYZ , " PairwiseTest " ) ;
auto generatorReductionAxis = GENERATE_XYZ ( ) {
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 1000 } ) ) ;
// axis goes to y here
y . push_back ( NDArrayFactory : : create_ < int > ( 0 ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 1000 } ) ) ;
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 1000 } ) ) ;
y . push_back ( NDArrayFactory : : create_ < int > ( 1 ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 } ) ) ;
// scalar case
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { 100 , 1000 } ) ) ;
y . push_back ( nullptr ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( 0.0f ) ) ;
} ;
ReductionBenchmark rb ( reduce : : FloatOps : : Mean ) ;
helper . runOperationSuit ( & rb , ( const std : : function < void ( ResultSet & , ResultSet & , ResultSet & ) > ) ( generatorReductionAxis ) , " ReductionAlongDimensionTest " ) ;
}
TEST_F ( PlaygroundTests , Test_OpBenchmark_3 ) {
TransformBenchmark tb ( transform : : StrictOps : : Tanh , " tanh " ) ;
PredefinedParameters a ( " alpha " , { 2 , 3 , 4 } ) ;
PredefinedParameters b ( " beta " , { 9 , 15 , 27 } ) ;
ParametersBatch batch ( { & a , & b } ) ;
auto parameters = batch . parameters ( ) ;
ASSERT_EQ ( 9 , parameters . size ( ) ) ;
auto params_0 = parameters [ 0 ] ;
ASSERT_EQ ( 2 , params_0 . getIntParam ( " alpha " ) ) ;
ASSERT_EQ ( 9 , params_0 . getIntParam ( " beta " ) ) ;
auto params_1 = parameters [ 1 ] ;
ASSERT_EQ ( 2 , params_1 . getIntParam ( " alpha " ) ) ;
ASSERT_EQ ( 15 , params_1 . getIntParam ( " beta " ) ) ;
auto params_3 = parameters [ 3 ] ;
ASSERT_EQ ( 3 , params_3 . getIntParam ( " alpha " ) ) ;
ASSERT_EQ ( 9 , params_3 . getIntParam ( " beta " ) ) ;
}
TEST_F ( PlaygroundTests , Test_OpBenchmark_4 ) {
BenchmarkHelper helper ;
PairwiseBenchmark pb ( pairwise : : Ops : : Add , " PWT ADD " ) ;
TransformBenchmark tb ( transform : : StrictOps : : Tanh , " tanh " ) ;
ScalarBenchmark sb ( scalar : : Multiply ) ;
sb . setY ( NDArrayFactory : : create_ < float > ( 119.0f ) ) ;
PredefinedParameters a ( " alpha " , { 2 , 3 , 4 } ) ;
PredefinedParameters b ( " beta " , { 9 , 15 , 27 } ) ;
ParametersBatch batch ( { & a , & b } ) ;
auto generator = PARAMETRIC_XZ ( ) {
// operands go together line by line
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
} ;
auto generatorXYZ = PARAMETRIC_XYZ ( ) {
// operands go together line by line
x . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
y . push_back ( NDArrayFactory : : create_ < float > ( ' f ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
} ;
helper . runOperationSuit ( & tb , generator , batch , " TransformTanh " ) ;
helper . runOperationSuit ( & sb , generator , batch , " ScalarMultiply " ) ;
helper . runOperationSuit ( & pb , generatorXYZ , batch , " PairwiseAdd " ) ;
}
TEST_F ( PlaygroundTests , Test_OpBenchmark_5 ) {
BenchmarkHelper helper ;
TransformBenchmark tb ( transform : : StrictOps : : Sigmoid , " sigmoid " ) ;
IntParameters length ( " length " , 100 , 500 , 100 ) ;
BoolParameters inplace ( " inplace " ) ;
ParametersBatch batch ( { & length , & inplace } ) ;
auto generator = PARAMETRIC_XZ ( ) {
// operands go together line by line
auto arr = NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " length " ) } ) ;
x . push_back ( arr ) ;
if ( p . getIntParam ( " inplace " ) = = 1 ) {
z . push_back ( arr ) ;
} else {
z . push_back ( NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " length " ) } ) ) ;
}
} ;
helper . runOperationSuit ( & tb , generator , batch , " Transform_Sigmoid " ) ;
}
TEST_F ( PlaygroundTests , Test_Something_5 ) {
auto x = NDArrayFactory : : create < float > ( ' c ' , { 100 , 10 } ) ;
auto y = NDArrayFactory : : create < float > ( ' c ' , { 10 } ) ;
auto z = NDArrayFactory : : create < float > ( ' c ' , { 100 , 10 } ) ;
std : : vector < int > axis = { 1 } ;
NativeOpExcutioner : : execBroadcast ( broadcast : : Add , x . buffer ( ) , x . shapeInfo ( ) , y . buffer ( ) , y . shapeInfo ( ) , z . buffer ( ) , z . shapeInfo ( ) ,
axis . data ( ) , axis . size ( ) , nullptr , nullptr ,
nullptr , nullptr ) ;
}
# define PARAMETRIC_D() [&] (Parameters &p) -> Context*
/*
TEST_F ( PlaygroundTests , Test_OpBenchmark_6 ) {
BenchmarkHelper helper ;
nd4j : : ops : : softmax op ;
DeclarableBenchmark db ( op , " SoftMaxTest " ) ;
PredefinedParameters a ( " alpha " , { 128 , 256 } ) ;
PredefinedParameters b ( " beta " , { 1024 , 2048 } ) ;
ParametersBatch batch ( { & a , & b } ) ;
auto generator = PARAMETRIC_D ( ) {
auto ctx = new Context ( 1 ) ;
ctx - > setInputArray ( 0 , NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
ctx - > setOutputArray ( 0 , NDArrayFactory : : create_ < float > ( ' c ' , { p . getIntParam ( " alpha " ) , p . getIntParam ( " beta " ) } ) ) ;
return ctx ;
} ;
helper . runOperationSuit ( & db , generator , batch , " parametrized softmax test " ) ;
}
*/
/*
TEST_F ( PlaygroundTests , Test_Strided_Stuff ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 1048576 , 1024 } ) ;
auto strided = array ( { 0 , 0 , 3 , 4 } , true ) ;
auto z = NDArrayFactory : : create < float > ( 0.0f ) ;
//strided->shapeInfo()[shape::shapeInfoLength(strided->rankOf()) - 2] = 1024;
int N = 1000 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < N ; e + + )
NativeOpExcutioner : : execReduceSameScalar ( reduce : : ReduceSameBenchmarkOp , strided . buffer ( ) , strided . shapeInfo ( ) , nullptr , z . buffer ( ) , z . shapeInfo ( ) ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld us; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
}
*/
/*
TEST_F ( PlaygroundTests , StridedReductionsNoEWS ) {
nd4j_printf ( " SETTING ELEMENTWISE THRESHOLD AND TAD THRESHOLD TO 1/1 " , " " ) ;
nd4j : : Environment : : getInstance ( ) - > setElementwiseThreshold ( 1 ) ;
nd4j : : Environment : : getInstance ( ) - > setTadThreshold ( 1 ) ;
BenchmarkHelper helper ;
IntPowerParameters stride ( " stride " , 2 , 0 , 10 ) ; //2^0=1, ..., 2^10=1024
ParametersBatch batch ( { & stride } ) ;
//This is an edge case: technically an EWS *should* be available here
auto generator1 = PARAMETRIC_XYZ ( ) {
auto stride = p . getIntParam ( " stride " ) ;
auto arr = NDArrayFactory : : create_ < float > ( ' c ' , { 1048576 + ( stride = = 1 ? 0 : 1 ) , stride } ) ;
NDArray * strided ;
if ( stride = = 1 ) {
strided = arr ;
} else {
strided = new NDArray ( ( * arr ) ( { 0 , 1048576 , 0 , 1 } , true ) ) ; //All rows, first column
}
strided - > assign ( 1.0 ) ;
x . push_back ( strided ) ;
y . push_back ( nullptr ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( 0.0f ) ) ;
} ;
ReductionBenchmark rbSum ( reduce : : SameOps : : Sum , " stridedSum " ) ;
helper . runOperationSuit ( & rbSum , ( const std : : function < void ( Parameters & , ResultSet & , ResultSet & , ResultSet & ) > ) ( generator1 ) , batch , " Strided Sum - No EWS Test 1 " ) ;
//No EWS defined for this case
auto generator2 = PARAMETRIC_XYZ ( ) {
auto stride = p . getIntParam ( " stride " ) ;
auto arr = NDArrayFactory : : create_ < float > ( ' c ' , { ( stride = = 1 ? 1 : 2 ) * 1024 , 1024 , stride } ) ;
NDArray * strided ;
if ( stride = = 1 ) {
strided = arr ;
} else {
strided = new NDArray ( ( * arr ) ( { 0 , 2 * 1024 , 2 , 0 , 0 , 0 , 0 , 1 , 1 } , true , true ) ) ;
}
strided - > assign ( 1.0 ) ;
x . push_back ( strided ) ;
y . push_back ( nullptr ) ;
z . push_back ( NDArrayFactory : : create_ < float > ( 0.0f ) ) ;
} ;
ReductionBenchmark rbSum2 ( reduce : : SameOps : : Sum , " stridedSumNoEWS " ) ;
helper . runOperationSuit ( & rbSum2 , ( const std : : function < void ( Parameters & , ResultSet & , ResultSet & , ResultSet & ) > ) ( generator2 ) , batch , " Strided Sum - No EWS Test 2 " ) ;
}
*/
# ifndef __CUDABLAS__
TEST_F ( PlaygroundTests , LambdaTest_1 ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 8192 , 1024 } ) ;
array . linspace ( 1 ) ;
auto lambda = LAMBDA_F ( _x ) {
return _x + 32.12f ;
} ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < numIterations ; e + + ) {
array . applyLambda < float > ( lambda ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Lambda 1 time %lld us\n", outerTime / numIterations);
}
TEST_F ( PlaygroundTests , LambdaTest_2 ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 8192 , 1024 } ) ;
auto row = NDArrayFactory : : create < float > ( ' c ' , { 1 , 1024 } ) ;
array . linspace ( 1 ) ;
auto lambda = LAMBDA_F ( _x ) {
return _x + 32.12f ;
} ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < numIterations ; e + + ) {
array . applyBroadcast ( broadcast : : Add , { 1 } , & row ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Broadcast time %lld us\n", outerTime / numIterations);
}
TEST_F ( PlaygroundTests , NoCacheTest_1 ) {
std : : vector < NDArray * > pool ( poolSize ) ;
auto source = NDArrayFactory : : create < float > ( ' c ' , { 8192 , 1024 } ) ;
for ( int e = 0 ; e < pool . size ( ) ; e + + )
pool [ e ] = source . dup ( ) ;
auto lambda = LAMBDA_F ( _x ) {
return _x * 32.12f ;
} ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
int cnt = 0 ;
for ( int e = 0 ; e < numIterations ; e + + ) {
auto v = pool [ poolSize - 1 - ( cnt + + ) ] ;
v - > applyLambda < float > ( lambda ) ;
if ( cnt = = poolSize )
cnt = 0 ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Non-cached time %lld us\n", outerTime / numIterations);
for ( auto v : pool )
delete v ;
}
TEST_F ( PlaygroundTests , NoCacheTest_2 ) {
std : : vector < NDArray * > pool1 ( poolSize ) ;
std : : vector < NDArray * > pool2 ( poolSize ) ;
auto source = NDArrayFactory : : create < float > ( ' c ' , { 8192 , 1024 } ) ;
for ( int e = 0 ; e < pool1 . size ( ) ; e + + ) {
pool1 [ e ] = source . dup ( ) ;
pool2 [ e ] = source . dup ( ) ;
}
auto lambda = LAMBDA_FF ( _x , _y ) {
return _x * 32.12f + _y ;
} ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
int cnt = 0 ;
for ( int e = 0 ; e < numIterations ; e + + ) {
auto v1 = pool1 [ poolSize - 1 - cnt ] ;
auto v2 = pool2 [ cnt + + ] ;
v1 - > applyPairwiseLambda < float > ( v2 , lambda ) ;
if ( cnt = = poolSize )
cnt = 0 ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Non-cached PWT time %lld us\n", outerTime / numIterations);
for ( auto v : pool1 )
delete v ;
for ( auto v : pool2 )
delete v ;
}
# endif
TEST_F ( PlaygroundTests , ReductionTest_1 ) {
std : : vector < NDArray * > pool1 ( poolSize ) ;
std : : vector < NDArray * > pool2 ( poolSize ) ;
auto source = NDArrayFactory : : create < float > ( ' c ' , { 1 , 100 } ) ;
for ( int e = 0 ; e < pool1 . size ( ) ; e + + ) {
pool1 [ e ] = source . dup ( ) ;
pool2 [ e ] = source . dup ( ) ;
}
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
int cnt = 0 ;
for ( int e = 0 ; e < 1 ; e + + ) {
auto v = pool1 [ poolSize - 1 - cnt ] ;
auto r = v - > sumNumber ( ) ;
if ( cnt = = poolSize )
cnt = 0 ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( timeEnd - timeStart ) . count ( ) ;
auto outerTimeMs = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Non-cached reduction time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000, outerTimeMs);
for ( auto v : pool1 )
delete v ;
for ( auto v : pool2 )
delete v ;
}
TEST_F ( PlaygroundTests , ScalarTest_1 ) {
std : : vector < NDArray * > pool1 ( poolSize ) ;
std : : vector < NDArray * > pool2 ( poolSize ) ;
auto source = NDArrayFactory : : create < float > ( ' c ' , { 1 , 100 } ) ;
for ( int e = 0 ; e < pool1 . size ( ) ; e + + ) {
pool1 [ e ] = source . dup ( ) ;
pool2 [ e ] = source . dup ( ) ;
}
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
int cnt = 0 ;
float * buff = reinterpret_cast < float * > ( source . buffer ( ) ) ;
for ( int e = 0 ; e < 100 ; e + + ) {
//auto v = pool1[poolSize - 1 - cnt];
//v->template applyScalar<simdOps::Add<float>>(2.0f);
source . applyScalar ( scalar : : Add , 2.0f ) ;
//functions::scalar::ScalarTransform<float>::template transformEx<simdOps::Add<float>>(source.buffer(), 1, source.buffer(), 1, 2.0f, nullptr, source.lengthOf());
//functions::scalar::ScalarTransform<float>::template transform<simdOps::Add<float>>(buff, 1, buff, 1, 2.0f, nullptr, 100);
cnt + + ;
if ( cnt = = poolSize )
cnt = 0 ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( timeEnd - timeStart ) . count ( ) ;
auto outerTimeMs = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Cached scalar time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000L, outerTimeMs);
for ( auto v : pool1 )
delete v ;
for ( auto v : pool2 )
delete v ;
}
TEST_F ( PlaygroundTests , ScalarTest_2 ) {
std : : vector < NDArray * > pool1 ( poolSize ) ;
std : : vector < NDArray * > pool2 ( poolSize ) ;
auto source = NDArrayFactory : : create < float > ( ' c ' , { 1 , 100 } ) ;
for ( int e = 0 ; e < pool1 . size ( ) ; e + + ) {
pool1 [ e ] = source . dup ( ) ;
pool2 [ e ] = source . dup ( ) ;
}
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
int cnt = 0 ;
float * array = reinterpret_cast < float * > ( source . buffer ( ) ) ;
for ( int e = 0 ; e < 1000 ; e + + ) {
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SIMD
2019-06-06 14:21:15 +02:00
for ( int i = 0 ; i < source . lengthOf ( ) ; i + + ) {
array [ i ] = simdOps : : Add < float , float , float > : : op ( array [ i ] , 2.0f ) ;
}
cnt + + ;
if ( cnt = = poolSize )
cnt = 0 ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( timeEnd - timeStart ) . count ( ) ;
auto outerTimeMs = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("Cached manual scalar time avg: %lld ns; Total time: %lld ms;\n", outerTime / 100000, outerTimeMs);
for ( auto v : pool1 )
delete v ;
for ( auto v : pool2 )
delete v ;
}
TEST_F ( PlaygroundTests , Test_Profile_1 ) {
GraphProfile prof ;
prof . setBuildTime ( 70 ) ;
prof . setExecutionTime ( 130 ) ;
prof . startEvent ( " omega " ) ;
prof . spotEvent ( " alpha " ) ;
prof . spotEvent ( " beta " ) ;
prof . spotEvent ( " gamma " ) ;
prof . recordEvent ( " omega " ) ;
auto nodeA = prof . nodeById ( 1 , " MatMul " ) ;
auto nodeB = prof . nodeById ( 2 , " Sum " ) ;
auto nodeC = prof . nodeById ( 3 , " Conv2D " ) ;
nodeA - > setObjectsSize ( 512 ) ;
nodeA - > setTemporarySize ( 65536 ) ;
nodeA - > setActivationsSize ( 512387 ) ;
nodeA - > setPreparationTime ( 127 ) ;
nodeA - > setExecutionTime ( 6539 ) ;
nodeB - > setObjectsSize ( 0 ) ;
nodeB - > setTemporarySize ( 0 ) ;
nodeB - > setActivationsSize ( 512387 ) ;
nodeB - > setPreparationTime ( 132 ) ;
nodeB - > setExecutionTime ( 2047 ) ;
nodeC - > setObjectsSize ( 1536 ) ;
nodeC - > setTemporarySize ( 2355674 ) ;
nodeC - > setActivationsSize ( 1022092 ) ;
nodeC - > setPreparationTime ( 129 ) ;
nodeC - > setExecutionTime ( 12983 ) ;
// prof.printOut();
}
# ifdef GRAPH_FILES_OK
TEST_F ( PlaygroundTests , Test_Profile_2 ) {
Environment : : getInstance ( ) - > setProfiling ( true ) ;
auto graph = GraphExecutioner : : importFromFlatBuffers ( " ./resources/ae_00.fb " ) ;
auto profile = GraphProfilingHelper : : profile ( graph , 2 ) ;
// profile->printOut();
delete graph ;
delete profile ;
}
# endif
TEST_F ( PlaygroundTests , Test_Im2Col_1 ) {
int bS = 16 , iH = 224 , iW = 224 , iC = 3 , oC = 3 , kH = 11 , kW = 11 , sH = 4 , sW = 4 , pH = 2 , pW = 2 , dH = 1 , dW = 1 ;
int oH = 55 , oW = 55 ;
int iterations = 1 ;
auto input = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , iH , iW } ) ;
auto output = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , kH , kW , oH , oW } ) ;
auto outputPermuted = NDArrayFactory : : create < float > ( ' c ' , { bS , oH , oW , iC , kH , kW } ) ;
outputPermuted . permutei ( { 0 , 3 , 4 , 5 , 1 , 2 } ) ;
nd4j : : ops : : im2col op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & input } , { & output } , { } , { kH , kW , sH , sW , pH , pW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// outputPermuted.printShapeInfo("permuted shape");
auto permStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & input } , { & outputPermuted } , { } , { kH , kW , sH , sW , pH , pW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto permEnd = std : : chrono : : system_clock : : now ( ) ;
auto permTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( permEnd - permStart ) . count ( ) ;
auto legacyStart = std : : chrono : : system_clock : : now ( ) ;
ExtraArguments extra ( { ( double ) kH , ( double ) kW , ( double ) sH , ( double ) sW , ( double ) pH , ( double ) pW , ( double ) dH , ( double ) dW , ( double ) 0.f , ( double ) 0.f } ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
input . applyTransform ( transform : : Im2col , & output , & extra ) ;
}
auto legacyEnd = std : : chrono : : system_clock : : now ( ) ;
auto legacyTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( legacyEnd - legacyStart ) . count ( ) ;
auto legacyPermStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
input . applyTransform ( transform : : Im2col , & outputPermuted , & extra ) ;
}
auto legacyPermEnd = std : : chrono : : system_clock : : now ( ) ;
auto legacyPermTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( legacyPermEnd - legacyPermStart ) . count ( ) ;
NativeOps nativeOps ;
Nd4jLong iArgs [ ] = { kH , kW , sH , sW , pH , pW , dH , dW , 0 } ;
Nd4jPointer inputBuffers [ ] = { input . buffer ( ) } ;
Nd4jPointer inputShapes [ ] = { input . shapeInfo ( ) } ;
Nd4jPointer outputBuffers [ ] = { output . buffer ( ) } ;
Nd4jPointer outputShapes [ ] = { output . shapeInfo ( ) } ;
auto javaStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
nativeOps . execCustomOp ( nullptr , op . getOpHash ( ) , inputBuffers , inputShapes , 1 , outputBuffers , outputShapes , 1 , nullptr , 0 , iArgs , 9 , nullptr , 0 , false ) ;
}
auto javaEnd = std : : chrono : : system_clock : : now ( ) ;
auto javaTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( javaEnd - javaStart ) . count ( ) ;
Nd4jPointer outputPermBuffers [ ] = { outputPermuted . buffer ( ) } ;
Nd4jPointer outputPermShapes [ ] = { outputPermuted . shapeInfo ( ) } ;
auto javaPermStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
nativeOps . execCustomOp ( nullptr , op . getOpHash ( ) , inputBuffers , inputShapes , 1 , outputPermBuffers , outputPermShapes , 1 , nullptr , 0 , iArgs , 9 , nullptr , 0 , false ) ;
}
auto javaPermEnd = std : : chrono : : system_clock : : now ( ) ;
auto javaPermTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( javaPermEnd - javaPermStart ) . count ( ) ;
// nd4j_printf("New time: %lld us;\n", outerTime / iterations);
// nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
// nd4j_printf("Legacy time: %lld us;\n", legacyTime / iterations);
// nd4j_printf("Legacy Permuted time: %lld us;\n", legacyPermTime / iterations);
// nd4j_printf("Java time: %lld us;\n", javaTime / iterations);
// nd4j_printf("Java Permuted time: %lld us;\n", javaPermTime / iterations);
}
TEST_F ( PlaygroundTests , Test_Im2Col_2 ) {
auto input = NDArrayFactory : : create < float > ( ' c ' , { 16 , 3 , 224 , 224 } ) ;
auto output = NDArrayFactory : : create < float > ( ' c ' , { 16 , 3 , 11 , 11 , 55 , 55 } ) ;
auto outputPermuted = NDArrayFactory : : create < float > ( ' c ' , { 16 , 55 , 55 , 3 , 11 , 11 } ) ;
outputPermuted . permutei ( { 0 , 3 , 4 , 5 , 1 , 2 } ) ;
nd4j : : ops : : im2col op ;
Nd4jLong iArgs [ ] = { 11 , 11 , 4 , 4 , 2 , 2 , 1 , 1 , 0 } ;
Nd4jPointer inputBuffers [ ] = { input . buffer ( ) } ;
Nd4jPointer inputShapes [ ] = { input . shapeInfo ( ) } ;
Nd4jPointer outputPermBuffers [ ] = { outputPermuted . buffer ( ) } ;
Nd4jPointer outputPermShapes [ ] = { outputPermuted . shapeInfo ( ) } ;
NativeOps nativeOps ;
nativeOps . execCustomOp ( nullptr , op . getOpHash ( ) , inputBuffers , inputShapes , 1 , outputPermBuffers , outputPermShapes , 1 , nullptr , 0 , iArgs , 9 , nullptr , 0 , false ) ;
}
TEST_F ( PlaygroundTests , Test_Col2Im_1 ) {
int bS = 16 , iH = 224 , iW = 224 , iC = 3 , oC = 3 , kH = 11 , kW = 11 , sH = 4 , sW = 4 , pH = 2 , pW = 2 , dH = 1 , dW = 1 ;
int oH = 55 , oW = 55 ;
int iterations = 1 ;
auto input = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , kH , kW , oH , oW } ) ;
auto output = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , iH , iW } ) ;
auto inputPermuted = NDArrayFactory : : create < float > ( ' c ' , { bS , oH , oW , iC , kH , kW } ) ;
inputPermuted . permutei ( { 0 , 3 , 4 , 5 , 1 , 2 } ) ;
auto outputPermuted = NDArrayFactory : : create < float > ( ' c ' , { bS , iH , iW , iC } ) ;
outputPermuted . permutei ( { 0 , 3 , 1 , 2 } ) ;
input = 10. ;
output = 2. ;
inputPermuted = 10. ;
outputPermuted = 2. ;
nd4j : : ops : : col2im op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & input } , { & output } , { } , { sH , sW , pH , pW , iH , iW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
auto permStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & inputPermuted } , { & outputPermuted } , { } , { sH , sW , pH , pW , iH , iW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto permEnd = std : : chrono : : system_clock : : now ( ) ;
auto permTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( permEnd - permStart ) . count ( ) ;
// nd4j_printf("C-order time: %lld us;\n", outerTime / iterations);
// nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
}
TEST_F ( PlaygroundTests , Test_Im2Col_3 ) {
int bS = 16 , iH = 224 , iW = 224 , iC = 3 , oC = 3 , kH = 11 , kW = 11 , sH = 4 , sW = 4 , pH = 2 , pW = 2 , dH = 1 , dW = 1 ;
int oH = 55 , oW = 55 ;
int iterations = 1 ;
auto output = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , kH , kW , oH , oW } ) ;
auto input = NDArrayFactory : : create < float > ( ' c ' , { bS , iC , iH , iW } ) ;
auto outputPermuted = NDArrayFactory : : create < float > ( ' c ' , { bS , oH , oW , iC , kH , kW } ) ;
outputPermuted . permutei ( { 0 , 3 , 4 , 5 , 1 , 2 } ) ;
auto inputPermuted = NDArrayFactory : : create < float > ( ' c ' , { bS , iH , iW , iC } ) ;
inputPermuted . permutei ( { 0 , 3 , 1 , 2 } ) ;
input = 10. ;
output = 2. ;
inputPermuted = 10. ;
outputPermuted = 2. ;
nd4j : : ops : : im2col op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & input } , { & output } , { } , { kH , kW , sH , sW , pH , pW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto outerTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
auto permStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & inputPermuted } , { & outputPermuted } , { } , { kH , kW , sH , sW , pH , pW , dH , dW , 0 } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result ) ;
}
auto permEnd = std : : chrono : : system_clock : : now ( ) ;
auto permTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( permEnd - permStart ) . count ( ) ;
// nd4j_printf("C-order time: %lld us;\n", outerTime / iterations);
// nd4j_printf("Permuted time: %lld us;\n", permTime / iterations);
}
TEST_F ( PlaygroundTests , loop_test_1 ) {
if ( 1 > 0 )
return ;
auto f = NDArrayFactory : : create < float > ( ' c ' , { 2 } , { 5000 , 10000 } ) ;
nd4j : : ops : : randomuniform op ;
auto result = op . execute ( { & f } , { - 1.0f , 1.0f } , { } ) ;
ASSERT_EQ ( Status : : OK ( ) , result - > status ( ) ) ;
auto array = result - > at ( 0 ) ;
auto buffer = array - > buffer ( ) ;
int cnt = 0 ;
int iterations = 1 ;
//nd4j_printf("Array length: %lld\n", array->lengthOf());
int length = ( int ) array - > lengthOf ( ) ;
int span = ( int ) ( array - > lengthOf ( ) / 6 ) + 8 ;
NativeOps ops ;
auto t = new int [ 1000000 ] ;
FloatBits fb ;
float threshold = 0.99f ;
fb . f_ = threshold ;
int le = ops . estimateThreshold ( nullptr , reinterpret_cast < void * > ( array - > buffer ( ) ) , array - > shapeInfo ( ) , static_cast < int > ( array - > lengthOf ( ) ) , threshold ) ;
t [ 0 ] = le ;
t [ 1 ] = length ;
t [ 2 ] = fb . i_ ;
//nd4j_printf("number of elements: [%i]\n", le);
long permTime = 0 ;
for ( int x = 0 ; x < iterations ; x + + ) {
auto permStart = std : : chrono : : system_clock : : now ( ) ;
ops . estimateThreshold ( nullptr , reinterpret_cast < void * > ( array - > buffer ( ) ) , array - > shapeInfo ( ) , static_cast < int > ( array - > lengthOf ( ) ) , threshold ) ;
TypeCast : : convertToThreshold < float > ( nullptr , buffer , array - > lengthOf ( ) , t ) ;
auto permEnd = std : : chrono : : system_clock : : now ( ) ;
permTime + = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( permEnd - permStart ) . count ( ) ;
}
nd4j_printf ( " Permuted time: %lld us; Counter: %i; \n " , permTime / iterations , cnt ) ;
delete result ;
delete [ ] t ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , ndarray_tile_test1 ) {
auto x = NDArrayFactory : : create < float > ( ' c ' , { 20 , 30 } ) ;
auto exp = NDArrayFactory : : create < float > ( ' c ' , { 2 , 40 , 60 } ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
auto tiled = x . tile ( { 2 , 2 , 2 } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto time = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("c-order time: %d;\n", time);
ASSERT_TRUE ( tiled . isSameShape ( & exp ) ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , ndarray_tile_test2 ) {
auto x = NDArrayFactory : : create < float > ( ' f ' , { 20 , 30 } ) ;
auto exp = NDArrayFactory : : create < float > ( ' f ' , { 2 , 40 , 60 } ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
auto tiled = x . tile ( { 2 , 2 , 2 } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto time = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
// nd4j_printf("f-order time: %d;\n", time);
ASSERT_TRUE ( tiled . isSameShape ( & exp ) ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loopThroughArrs_test1 ) {
NDArray x ( ' c ' , { 20 , 30 , 40 } , nd4j : : DataType : : DOUBLE ) ;
NDArray y ( ' f ' , { 50 , 30 , 4 , 4 } , nd4j : : DataType : : DOUBLE ) ;
auto xBuff = x . bufferAsT < double > ( ) ;
auto yBuff = y . bufferAsT < double > ( ) ;
auto len = x . lengthOf ( ) ;
//***********************************
//***********************************
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_FOR_ARGS ( schedule ( guided ) )
2019-06-06 14:21:15 +02:00
for ( Nd4jLong i = 0 ; i < len ; + + i ) {
Nd4jLong offset1 = shape : : getIndexOffset ( i , x . getShapeInfo ( ) , len ) ;
Nd4jLong offset2 = shape : : getIndexOffset ( i , y . getShapeInfo ( ) , len ) ;
xBuff [ offset1 ] = yBuff [ offset2 ] ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto myTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
//***********************************
//***********************************
timeStart = std : : chrono : : system_clock : : now ( ) ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_FOR_ARGS ( schedule ( guided ) )
2019-06-06 14:21:15 +02:00
for ( Nd4jLong i = 0 ; i < len ; + + i ) {
Nd4jLong offset1 = shape : : getIndexOffset ( i , x . getShapeInfo ( ) , len ) ;
Nd4jLong offset2 = shape : : getIndexOffset ( i , y . getShapeInfo ( ) , len ) ;
xBuff [ offset1 ] = yBuff [ offset2 ] ;
}
timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto oldTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( timeEnd - timeStart ) . count ( ) ;
nd4j_printf ( " My time: %lld us; \n " , myTime ) ;
nd4j_printf ( " Old time: %lld us; \n " , oldTime ) ;
ASSERT_TRUE ( 1 ) ;
}
static void loopSpan ( float * x , Nd4jLong * xShapeInfo , float * y , Nd4jLong * yShapeInfo , float * z , Nd4jLong * zShapeInfo ) {
auto len = shape : : length ( xShapeInfo ) ;
int xEws = shape : : elementWiseStride ( xShapeInfo ) ;
int yEws = shape : : elementWiseStride ( yShapeInfo ) ;
int zEws = shape : : elementWiseStride ( zShapeInfo ) ;
BlockInformation info ( len , ELEMENT_THRESHOLD ) ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_ARGS ( num_threads ( info . threads ) if ( info . threads > 1 ) )
2019-06-06 14:21:15 +02:00
{
auto i = omp_get_thread_num ( ) ;
Nd4jLong itemsToLoop = ( i < info . threads - 1 ) ? info . items : info . items + info . remainder ;
Nd4jLong index = i * info . items ;
auto xi = x + xEws * index ;
auto yi = y + yEws * index ;
auto zi = z + zEws * index ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SIMD
2019-06-06 14:21:15 +02:00
for ( Nd4jLong j = 0 ; j < itemsToLoop ; j + + )
zi [ j * zEws ] = simdOps : : LogPoissonLoss < float , float , float > : : op ( xi [ j * xEws ] , yi [ j * yEws ] ) ;
}
}
static void loopSimple ( float * x , Nd4jLong * xShapeInfo , float * y , Nd4jLong * yShapeInfo , float * z , Nd4jLong * zShapeInfo ) {
auto len = shape : : length ( xShapeInfo ) ;
int xEws = shape : : elementWiseStride ( xShapeInfo ) ;
int yEws = shape : : elementWiseStride ( yShapeInfo ) ;
int zEws = shape : : elementWiseStride ( zShapeInfo ) ;
int threads = 6 ;
int span_size = len / threads + 1 ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS ( schedule ( static , span_size ) if ( len > ELEMENT_THRESHOLD ) proc_bind ( close ) )
2019-06-06 14:21:15 +02:00
for ( Nd4jLong i = 0 ; i < len ; + + i )
z [ i * zEws ] = simdOps : : LogPoissonLoss < float , float , float > : : op ( x [ i * xEws ] , y [ i * yEws ] ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loopThroughArrs_test2 ) {
NDArray x ( ' c ' , { 40 , 25 } , nd4j : : DataType : : FLOAT32 ) ;
const int iterations = 1 ;
const int arrays = 10 ;
std : : vector < NDArray > arrs ( arrays ) ;
for ( auto & arr : arrs )
arr = x ;
//***********************************
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
srand ( 119 ) ;
for ( Nd4jLong i = 0 ; i < iterations ; + + i ) {
int xInd = rand ( ) % arrays ;
int yInd = rand ( ) % arrays ;
int zInd = rand ( ) % arrays ;
auto xBuff = arrs [ xInd ] . bufferAsT < float > ( ) ;
auto yBuff = arrs [ yInd ] . bufferAsT < float > ( ) ;
auto zBuff = arrs [ zInd ] . bufferAsT < float > ( ) ;
auto xShapeInfo = arrs [ xInd ] . getShapeInfo ( ) ;
auto yShapeInfo = arrs [ yInd ] . getShapeInfo ( ) ;
auto zShapeInfo = arrs [ zInd ] . getShapeInfo ( ) ;
loopSimple ( xBuff , xShapeInfo , yBuff , yShapeInfo , zBuff , zShapeInfo ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto simpleTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
//***********************************
timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( Nd4jLong i = 0 ; i < iterations ; + + i ) {
int xInd = rand ( ) % arrays ;
int yInd = rand ( ) % arrays ;
int zInd = rand ( ) % arrays ;
auto xBuff = arrs [ xInd ] . bufferAsT < float > ( ) ;
auto yBuff = arrs [ yInd ] . bufferAsT < float > ( ) ;
auto zBuff = arrs [ zInd ] . bufferAsT < float > ( ) ;
auto xShapeInfo = arrs [ xInd ] . getShapeInfo ( ) ;
auto yShapeInfo = arrs [ yInd ] . getShapeInfo ( ) ;
auto zShapeInfo = arrs [ zInd ] . getShapeInfo ( ) ;
loopSpan ( xBuff , xShapeInfo , yBuff , yShapeInfo , zBuff , zShapeInfo ) ;
}
timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
nd4j_printf ( " simple time: %lld us; \n " , simpleTime ) ;
nd4j_printf ( " span time: %lld us; \n " , spanTime ) ;
ASSERT_TRUE ( 1 ) ;
}
static void loop1 ( float * x , Nd4jLong * xShapeInfo , float * y , Nd4jLong * yShapeInfo , float * z , Nd4jLong * zShapeInfo ) {
auto len = shape : : length ( xShapeInfo ) ;
int xEws = shape : : elementWiseStride ( xShapeInfo ) ;
int yEws = shape : : elementWiseStride ( yShapeInfo ) ;
int zEws = shape : : elementWiseStride ( zShapeInfo ) ;
nd4j : : OmpLaunchHelper info ( len ) ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_ARGS ( num_threads ( info . _numThreads ) )
2019-06-06 14:21:15 +02:00
{
auto threadNum = omp_get_thread_num ( ) ;
Nd4jLong threadOffset = info . getThreadOffset ( threadNum ) ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SIMD
2019-06-06 14:21:15 +02:00
for ( Nd4jLong j = 0 ; j < info . getItersPerThread ( threadNum ) ; j + + ) {
Nd4jLong xOffset = shape : : getIndexOffset ( j + threadOffset , xShapeInfo , len ) ;
Nd4jLong yOffset = shape : : getIndexOffset ( j + threadOffset , yShapeInfo , len ) ;
Nd4jLong zOffset = shape : : getIndexOffset ( j + threadOffset , zShapeInfo , len ) ;
z [ xOffset ] = simdOps : : LogPoissonLoss < float , float , float > : : op ( x [ xOffset ] , y [ xOffset ] ) ;
}
}
}
static void loop2 ( float * x , Nd4jLong * xShapeInfo , float * y , Nd4jLong * yShapeInfo , float * z , Nd4jLong * zShapeInfo ) {
auto len = shape : : length ( xShapeInfo ) ;
int xEws = shape : : elementWiseStride ( xShapeInfo ) ;
int yEws = shape : : elementWiseStride ( yShapeInfo ) ;
int zEws = shape : : elementWiseStride ( zShapeInfo ) ;
int threads = 6 ;
int span_size = len / threads + 1 ;
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS ( schedule ( static ) )
2019-06-06 14:21:15 +02:00
for ( Nd4jLong i = 0 ; i < len ; + + i ) {
Nd4jLong xOffset = shape : : getIndexOffset ( i , xShapeInfo , len ) ;
Nd4jLong yOffset = shape : : getIndexOffset ( i , yShapeInfo , len ) ;
Nd4jLong zOffset = shape : : getIndexOffset ( i , zShapeInfo , len ) ;
z [ xOffset ] = simdOps : : LogPoissonLoss < float , float , float > : : op ( x [ xOffset ] , y [ xOffset ] ) ;
}
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loopThroughArrs_test3 ) {
NDArray x ( ' c ' , { 50 , 250 } , nd4j : : DataType : : FLOAT32 ) ;
const int iterations = 1 ;
const int arrays = 100 ;
std : : vector < NDArray > arrs ( arrays ) ;
for ( auto & arr : arrs )
arr = x ;
//***********************************
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
srand ( 119 ) ;
for ( Nd4jLong i = 0 ; i < iterations ; + + i ) {
int xInd = rand ( ) % arrays ;
int yInd = rand ( ) % arrays ;
int zInd = rand ( ) % arrays ;
auto xBuff = arrs [ xInd ] . bufferAsT < float > ( ) ;
auto yBuff = arrs [ yInd ] . bufferAsT < float > ( ) ;
auto zBuff = arrs [ zInd ] . bufferAsT < float > ( ) ;
auto xShapeInfo = arrs [ xInd ] . getShapeInfo ( ) ;
auto yShapeInfo = arrs [ yInd ] . getShapeInfo ( ) ;
auto zShapeInfo = arrs [ zInd ] . getShapeInfo ( ) ;
loop2 ( xBuff , xShapeInfo , yBuff , yShapeInfo , zBuff , zShapeInfo ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto simpleTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
//***********************************
timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( Nd4jLong i = 0 ; i < iterations ; + + i ) {
int xInd = rand ( ) % arrays ;
int yInd = rand ( ) % arrays ;
int zInd = rand ( ) % arrays ;
auto xBuff = arrs [ xInd ] . bufferAsT < float > ( ) ;
auto yBuff = arrs [ yInd ] . bufferAsT < float > ( ) ;
auto zBuff = arrs [ zInd ] . bufferAsT < float > ( ) ;
auto xShapeInfo = arrs [ xInd ] . getShapeInfo ( ) ;
auto yShapeInfo = arrs [ yInd ] . getShapeInfo ( ) ;
auto zShapeInfo = arrs [ zInd ] . getShapeInfo ( ) ;
loop1 ( xBuff , xShapeInfo , yBuff , yShapeInfo , zBuff , zShapeInfo ) ;
}
timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
nd4j_printf ( " simpleTime time: %lld us; \n " , simpleTime ) ;
nd4j_printf ( " spanTime time: %lld us; \n " , spanTime ) ;
ASSERT_TRUE ( 1 ) ;
}
TEST_F ( PlaygroundTests , test_batched_skipgram_1 ) {
const int batchSize = 64 ;
const int codeLen = 6 ;
const int numWords = 244 ;
const int vectorLength = 50 ;
auto target = NDArrayFactory : : create < int > ( ' c ' , { batchSize } ) ;
auto ngStarter = NDArrayFactory : : empty < int > ( ) ;
auto indices = NDArrayFactory : : create < int > ( ' c ' , { batchSize , codeLen } ) ;
auto codes = NDArrayFactory : : create < int8_t > ( ' c ' , { batchSize , codeLen } ) ;
auto syn0 = NDArrayFactory : : create < float > ( ' c ' , { numWords , vectorLength } ) ;
auto syn1 = NDArrayFactory : : create < float > ( ' c ' , { numWords , vectorLength } ) ;
auto syn1Neg = NDArrayFactory : : empty < float > ( ) ;
auto expTable = NDArrayFactory : : linspace < float > ( 0.001 , 0.995 , 10000 ) ;
auto negTable = NDArrayFactory : : empty < float > ( ) ;
auto alpha = NDArrayFactory : : create < double > ( ' c ' , { batchSize } ) ;
auto randomValue = NDArrayFactory : : create < Nd4jLong > ( ' c ' , { batchSize } ) ;
auto inferenceVector = NDArrayFactory : : empty < float > ( ) ;
syn0 . assign ( 0.01 ) ;
syn1 . assign ( 0.02 ) ;
Nd4jLong rv = 2843242345121L ;
auto lr = 0.025 ;
for ( int e = 0 ; e < batchSize ; e + + ) {
target . p ( e , e ) ;
alpha . p ( e , lr ) ;
randomValue . p ( e , rv ) ;
lr - = 0.001 ;
for ( int s = 1 ; s < codeLen ; s + + ) {
indices . p ( e , s , nd4j : : math : : nd4j_abs < Nd4jLong > ( rv % numWords ) ) ;
codes . p ( e , s , s % 2 ) ;
rv = nd4j : : math : : nd4j_abs < Nd4jLong > ( rv * 25214903917L + 11 ) ;
}
rv = nd4j : : math : : nd4j_abs < Nd4jLong > ( rv * 25214903917L + 11 ) ;
}
//indices.printIndexedBuffer("indices");
//codes.printIndexedBuffer("codes");
auto iterations = 1 ;
nd4j : : ops : : skipgram op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
auto result = op . execute ( { & target , & ngStarter , & indices , & codes , & syn0 , & syn1 , & syn1Neg , expTable , & negTable , & alpha , & randomValue , & inferenceVector } , { } , { } , { false } , true ) ;
ASSERT_EQ ( Status : : OK ( ) , result - > status ( ) ) ;
delete result ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld us; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
delete expTable ;
}
TEST_F ( PlaygroundTests , test_reduce_scalar_float_1 ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 32 , 128 , 256 , 256 } ) ;
auto target = NDArrayFactory : : create < float > ( 0.0f ) ;
// warm up
for ( int e = 0 ; e < 1 ; e + + ) {
NativeOpExecutioner : : execReduceFloatScalar ( LaunchContext : : defaultContext ( ) , reduce : : Mean , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
int iterations = 1 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
NativeOpExecutioner : : execReduceFloatScalar ( LaunchContext : : defaultContext ( ) , reduce : : Mean , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld us; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
}
TEST_F ( PlaygroundTests , test_reduce_scalar_float_2 ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 100000 } ) ;
auto target = NDArrayFactory : : create < float > ( 0.0f ) ;
// warm up
for ( int e = 0 ; e < 1 ; e + + ) {
NativeOpExecutioner : : execReduceFloatScalar ( LaunchContext : : defaultContext ( ) , reduce : : ReduceFloatBenchmarkOp , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
int iterations = 1 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
NativeOpExecutioner : : execReduceFloatScalar ( LaunchContext : : defaultContext ( ) , reduce : : ReduceFloatBenchmarkOp , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld ns; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
}
TEST_F ( PlaygroundTests , test_reduce_scalar_same_2 ) {
auto array = NDArrayFactory : : create < float > ( ' c ' , { 100000 } ) ;
auto target = NDArrayFactory : : create < float > ( 0.0f ) ;
// warm up
for ( int e = 0 ; e < 1 ; e + + ) {
NativeOpExecutioner : : execReduceSameScalar ( LaunchContext : : defaultContext ( ) , reduce : : ReduceSameBenchmarkOp , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
int iterations = 1 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
NativeOpExecutioner : : execReduceSameScalar ( LaunchContext : : defaultContext ( ) , reduce : : ReduceSameBenchmarkOp , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , nullptr , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : nanoseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld ns; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
}
TEST_F ( PlaygroundTests , test_assign_float ) {
// auto array = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
// auto target = NDArrayFactory::create<float>('c', {32, 128, 256, 256});
auto array = NDArrayFactory : : create < float > ( ' c ' , { 32 , 64 , 128 , 128 } ) ;
auto target = NDArrayFactory : : create < float > ( ' c ' , { 32 , 64 , 128 , 128 } ) ;
array . assign ( 119 ) ;
// warm up
for ( int e = 0 ; e < 5 ; e + + ) {
NativeOpExecutioner : : execTransformAny ( LaunchContext : : defaultContext ( ) , transform : : Assign , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) , nullptr , nullptr , nullptr ) ;
}
int iterations = 1 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
NativeOpExecutioner : : execTransformAny ( LaunchContext : : defaultContext ( ) , transform : : Assign , array . buffer ( ) , array . shapeInfo ( ) , array . specialBuffer ( ) , array . specialShapeInfo ( ) , target . buffer ( ) , target . shapeInfo ( ) , target . specialBuffer ( ) , target . specialShapeInfo ( ) , nullptr , nullptr , nullptr ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
auto bw = ( 1000000L * ( float ) ( array . lengthOf ( ) * array . sizeOfT ( ) ) / spanTime ) / 1024 / 1024 / 1024 ;
nd4j_printf ( " average time: %lld us; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
nd4j_printf ( " Bandwidth: %f GB/s \n " , bw )
}
/*
TEST_F ( PlaygroundTests , test_manual_loop ) {
const unsigned int len = 32 * 128 * 256 * 256 ;
auto array = new float [ len ] ;
auto z = new float [ len ] ;
for ( unsigned int e = 0 ; e < len ; e + + )
array [ e ] = ( float ) e ;
const int iterations = 100 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < iterations ; i + + ) {
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_FOR_ARGS ( num_threads ( 4 ) schedule ( static , 32768 ) )
2019-06-06 14:21:15 +02:00
for ( unsigned int e = 0 ; e < len ; e + + )
z [ e ] = array [ e ] ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
auto bw = ( 1000000L * ( float ) ( len * sizeof ( float ) ) / spanTime ) / 1024 / 1024 / 1024 ;
nd4j_printf ( " length: %i \n " , len ) ;
nd4j_printf ( " average time: %lld us; \n " , spanTime ) ;
nd4j_printf ( " total time: %lld ms; \n " , ttlTime ) ;
nd4j_printf ( " Bandwidth: %f GB/s \n " , bw )
delete [ ] array ;
delete [ ] z ;
}
TEST_F ( PlaygroundTests , test_col2im_permuted_1 ) {
auto x = NDArrayFactory : : create < float > ( ' c ' , { 8 , 64 , 55 , 55 , 3 , 3 } ) ;
x . assign ( 1.f ) ;
x . permutei ( { 0 , 1 , 4 , 5 , 2 , 3 } ) ;
auto z0 = NDArrayFactory : : create < float > ( ' c ' , { 64 , 8 , 112 , 112 } ) ;
z0 . permutei ( { 1 , 0 , 2 , 3 } ) ;
auto z1 = NDArrayFactory : : create < float > ( ' c ' , { 64 , 8 , 112 , 112 } ) ;
z1 . permutei ( { 1 , 0 , 2 , 3 } ) ;
nd4j_printf ( " Starting custom run... \n " , " " ) ;
const int iterations = 100 ;
nd4j : : ops : : col2im op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + ) {
op . execute ( { & x } , { & z0 } , { } , { 2 , 2 , 0 , 0 , 112 , 112 , 1 , 1 , 1 } , { } ) ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
nd4j_printf ( " Starting legacy run... \n " , " " ) ;
ExtraArguments arguments ( { 2. , 2. , 0. , 0. , 112. , 112. , 1. , 1. } ) ;
auto legacyStart = std : : chrono : : system_clock : : now ( ) ;
for ( int e = 0 ; e < iterations ; e + + )
x . applyTransform ( transform : : Col2Im , & z1 , & arguments ) ;
auto legacyEnd = std : : chrono : : system_clock : : now ( ) ;
auto legacySpanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( legacyEnd - legacyStart ) / iterations ) . count ( ) ;
auto legacyTtlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( legacyEnd - legacyStart ) ) . count ( ) ;
nd4j_printf ( " average time: %lld us vs %lld us; \n " , spanTime , legacySpanTime ) ;
nd4j_printf ( " total time: %lld ms vs %lld ms; \n " , ttlTime , legacyTtlTime ) ;
ASSERT_EQ ( z0 , z1 ) ;
}
TEST_F ( PlaygroundTests , test_addi_assign ) {
int iterations = 1 ;
auto x = NDArrayFactory : : create < float > ( ' c ' , { 1000000000 } ) ;
auto z = NDArrayFactory : : create < float > ( ' c ' , { 1000000000 } ) ;
x . assign ( 119.0f ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
x . applyScalar ( scalar : : Add , 1.0f , & z , nullptr ) ;
//z.assign(x);
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto spanTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / iterations ) . count ( ) ;
auto ttlTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) ) . count ( ) ;
auto bw = ( 1000000L * ( float ) ( x . lengthOf ( ) * x . sizeOfT ( ) ) / spanTime ) / 1024 / 1024 / 1024 ;
nd4j_printf ( " Avg add(1.0f) time: %lld us \n " , spanTime ) ;
nd4j_printf ( " Bandwidth: %f GB/s \n " , bw ) ;
}
/////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , conv2d_1 ) {
const int N = 100 ;
int bS = 8 , iH = 64 , iW = 64 , iC = 32 , oC = 32 , kH = 2 , kW = 2 , sH = 1 , sW = 1 , pH = 0 , pW = 0 , dH = 1 , dW = 1 ;
int paddingMode = 1 ; // 1-SAME, 0-VALID;
int dataFormat = 0 ; // 1-NHWC, 0-NCHW
NDArray input ( ' c ' , { bS , iC , iH , iW } , nd4j : : DataType : : FLOAT32 ) ;
NDArray output ( input ) ;
NDArray weights ( ' c ' , { kH , kW , iC , oC } , nd4j : : DataType : : FLOAT32 ) ;
NDArray bias ( ' c ' , { oC } , nd4j : : DataType : : FLOAT32 ) ;
input = 2. ;
weights . linspace ( 0.1 , 0.1 ) ;
bias = 0.5 ;
nd4j : : ops : : conv2d op ;
for ( int i = 0 ; i < 10 ; i + + )
100.5 * 0.5 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
op . execute ( { & input , & weights , & bias } , { & output } , { } , { kH , kW , sH , sW , pH , pW , dH , dW , paddingMode , dataFormat } , { } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
printf ( " duration %ld \n " , duration ) ;
}
*/
/////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , batchnorm_1 ) {
const int N = 1 ;
NDArray input ( ' c ' , { 8 , 32 , 64 , 64 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray output ( ' c ' , { 8 , 32 , 64 , 64 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray mean ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray variance ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray gamma ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray beta ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
input = 10.5 ;
mean = 5.5 ;
variance = 1.5 ;
gamma = 0.5 ;
beta = 2.5 ;
nd4j : : ops : : batchnorm_new op ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
// for (int i = 0; i <N ; i++)
op . execute ( { & input , & mean , & variance , & gamma , & beta } , { & output } , { 1e-5 } , { 1 , 1 , 1 } , { } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
printf ( " duration %ld \n " , duration ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , softmax_1 ) {
const int N = 1 ;
NDArray input ( ' c ' , { 1024 , 256 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray output ( ' c ' , { 1024 , 256 } , nd4j : : DataType : : FLOAT32 ) ;
input . linspace ( - 100. , 0.01 ) ;
nd4j : : ops : : softmax op ;
for ( int i = 0 ; i < 20 ; i + + )
100.5 * 100.5 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
op . execute ( { & input } , { & output } , { } , { 1 } , { } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
printf ( " duration %ld \n " , duration ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , subarr_1 ) {
NDArray x ( ' c ' , { 10 , 5 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray subArr1 = x ( { 0 , 0 , 3 , 4 } ) ;
NDArray subArr2 = x ( { 0 , 0 , 3 , 4 } , true ) ;
subArr1 . printShapeInfo ( " subArr1 " ) ;
subArr2 . printShapeInfo ( " subArr2 " ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , subarr_2 ) {
NDArray x ( ' c ' , { 10 , 5 } , nd4j : : DataType : : FLOAT32 ) ;
auto subArr1 = x . subarray ( { NDIndex : : all ( ) , NDIndex : : point ( 2 ) } ) ;
subArr1 - > printShapeInfo ( " subArr1 " ) ;
ASSERT_EQ ( 5 , subArr1 - > ews ( ) ) ;
delete subArr1 ;
}
////////////////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loops_1 ) {
/*
const int N = 1 ;
NDArray x ( ' c ' , { 16 , 32 , 64 , 64 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray z1 ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray z2 ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
NDArray z3 ( ' c ' , { 32 } , nd4j : : DataType : : FLOAT32 ) ;
std : : vector < int > dimsToExclude = { 0 , 2 , 3 } ;
std : : vector < int > tadDims = { 1 } ;
x . linspace ( 0.01 ) ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i )
32 * 512 ;
auto timeStart1 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
x . reduceAlongDimension ( nd4j : : reduce : : Mean , & z1 , dimsToExclude ) ;
auto timeEnd1 = std : : chrono : : system_clock : : now ( ) ;
auto duration1 = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd1 - timeStart1 ) / N ) . count ( ) ;
auto timeStartE = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
x . reduceAlongDimension ( nd4j : : reduce : : Sum , & z3 , dimsToExclude ) ;
auto timeEndE = std : : chrono : : system_clock : : now ( ) ;
auto durationE = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEndE - timeStartE ) / N ) . count ( ) ;
Nd4jLong * tadShapeInfo ( nullptr ) , * tadOffsets ( nullptr ) ;
x . getSubArrShapeAndOffsets ( tadDims , tadShapeInfo , tadOffsets ) ;
// shape::printShapeInfoLinear(tadShapeInfo);
// shape::printIntArray(tadOffsets, 32);
auto timeStart2 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
Loops : : loopReduce < float , float , float > ( x . bufferAsT < float > ( ) , tadShapeInfo , tadOffsets ,
z2 . bufferAsT < float > ( ) , z2 . getShapeInfo ( ) ,
nullptr ,
& simdOps : : Mean < float , float > : : startingValue ,
& simdOps : : Mean < float , float > : : update ,
& simdOps : : Mean < float , float > : : op ,
& simdOps : : Mean < float , float > : : postProcess ) ;
auto timeEnd2 = std : : chrono : : system_clock : : now ( ) ;
auto duration2 = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd2 - timeStart2 ) / N ) . count ( ) ;
RELEASE ( tadShapeInfo , x . getWorkspace ( ) ) ;
RELEASE ( tadOffsets , x . getWorkspace ( ) ) ;
// z1.printIndexedBuffer("z1 ");
// z2.printIndexedBuffer("z2 ");
ASSERT_TRUE ( z1 . equalsTo ( z2 ) ) ;
printf ( " duration old: %ld \n " , duration1 ) ;
printf ( " duration new: %ld \n " , duration2 ) ;
printf ( " duration E: %ld \n " , durationE ) ;
*/
}
////////////////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , newTads_1 ) {
const int N = 1 ;
Nd4jLong shapeInfo [ ] = { 4 , 1024 , 1024 , 1024 , 1024 , 1024 * 1024 * 1024 , 1024 * 1024 , 1024 , 1 , 16384 , 1 , 99 } ;
const int rank = shape : : rank ( shapeInfo ) ;
const std : : vector < int > dimsToExclude = { 1 , 3 } ;
const std : : vector < int > tadDims = { 0 , 2 } ;
const bool keepUnitesInShape = false ;
const Nd4jLong numOfSubArrs = ShapeUtils : : getNumOfSubArrs ( shapeInfo , dimsToExclude ) ;
const int subArrRank = ( rank = = dimsToExclude . size ( ) | | keepUnitesInShape ) ? rank : rank - dimsToExclude . size ( ) ;
auto sPtr = new Nd4jLong [ shape : : shapeInfoLength ( subArrRank ) ] ;
auto oPtr = new Nd4jLong [ numOfSubArrs ] ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i )
32 * 512 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + )
auto tadPack = nd4j : : ConstantTadHelper : : getInstance ( ) - > tadForDimensions ( shapeInfo , tadDims ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
printf ( " duration old: %ld \n " , duration ) ;
delete [ ] sPtr ;
delete [ ] oPtr ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loops_2 ) {
const uint N = 5 ;
const Nd4jLong dim0 ( 10 ) , dim1 ( 10 ) , dim2 ( 10 ) ;
const Nd4jLong shapeInfo [ 2 * 3 + 4 ] = { 3 , dim0 , dim1 , dim2 , 1 , dim0 , dim0 * dim1 , 8192 , 1 , 102 } ;
const Nd4jLong len = shape : : length ( shapeInfo ) ;
float * buff = new float [ len ] ;
const Nd4jLong * shape = shape : : shapeOf ( const_cast < Nd4jLong * > ( shapeInfo ) ) ;
const Nd4jLong * strides = shape : : stride ( const_cast < Nd4jLong * > ( shapeInfo ) ) ;
// OmpLaunchHelper threadsInfo(len);
Nd4jLong * xOffsets , * yOffsets , * zOffsets ;
xOffsets = new Nd4jLong [ len ] ;
yOffsets = new Nd4jLong [ len ] ;
zOffsets = new Nd4jLong [ len ] ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i ) 32 * 512 ;
//***********************************
//***********************************
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; + + i )
{
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_SECTIONS
2019-06-06 14:21:15 +02:00
{
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , xOffsets ) ;
}
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , yOffsets ) ;
}
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , zOffsets ) ;
}
}
PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( uint i = 0 ; i < len ; i + + )
buff [ zOffsets [ i ] ] = buff [ xOffsets [ i ] ] * buff [ yOffsets [ i ] ] ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto myTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
//***********************************
//***********************************
timeStart = std : : chrono : : system_clock : : now ( ) ;
uint xShapeInfoCast [ MAX_RANK ] ;
uint yShapeInfoCast [ MAX_RANK ] ;
uint zShapeInfoCast [ MAX_RANK ] ;
bool canCastX = DataTypeUtils : : castShapeInfo ( shapeInfo , xShapeInfoCast ) ;
bool canCastY = DataTypeUtils : : castShapeInfo ( shapeInfo , yShapeInfoCast ) ;
bool canCastZ = DataTypeUtils : : castShapeInfo ( shapeInfo , zShapeInfoCast ) ;
for ( int i = 0 ; i < N ; + + i )
{
PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( uint i = 0 ; i < len ; i + + ) {
auto xOffset = shape : : indexOffset ( i , shapeInfo , xShapeInfoCast , len , canCastX ) ;
auto yOffset = shape : : indexOffset ( i , shapeInfo , yShapeInfoCast , len , canCastY ) ;
auto zOffset = shape : : indexOffset ( i , shapeInfo , zShapeInfoCast , len , canCastZ ) ;
buff [ zOffset ] = buff [ xOffset ] * buff [ yOffset ] ;
}
// PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
// for (uint i0 = 0; i0 < shape[0]; ++i0)
// for (uint i1 = 0; i1 < shape[1]; ++i1)
// for (uint i2 = 0; i2 < shape[2]; ++i2)
// buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
}
timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto oldTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
nd4j_printf ( " My time: %lld us; \n " , myTime ) ;
nd4j_printf ( " Old time: %lld us; \n " , oldTime ) ;
delete [ ] xOffsets ;
delete [ ] yOffsets ;
delete [ ] zOffsets ;
delete [ ] buff ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loops_3 ) {
const uint N = 5 ;
// const Nd4jLong dim0(1024), dim1(1024), dim2(1024);
const Nd4jLong dim0 ( 10 ) , dim1 ( 10 ) , dim2 ( 10 ) ;
const Nd4jLong shapeInfo [ 2 * 3 + 4 ] = { 3 , dim0 , dim1 , dim2 , dim1 * dim2 , dim2 , 1 , 8192 , 1 , 99 } ;
const Nd4jLong len = shape : : length ( shapeInfo ) ;
float * buff = new float [ len ] ;
const Nd4jLong * shape = shape : : shapeOf ( const_cast < Nd4jLong * > ( shapeInfo ) ) ;
const Nd4jLong * strides = shape : : stride ( const_cast < Nd4jLong * > ( shapeInfo ) ) ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i ) 32 * 512 ;
//***********************************
//***********************************
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; + + i )
{
Nd4jLong * idxX = new Nd4jLong [ 3 ] ;
Nd4jLong * idxY = new Nd4jLong [ 3 ] ;
Nd4jLong * idxZ = new Nd4jLong [ 3 ] ;
Nd4jLong * offsetPerDimX = new Nd4jLong [ 3 ] ;
Nd4jLong * offsetPerDimY = new Nd4jLong [ 3 ] ;
Nd4jLong * offsetPerDimZ = new Nd4jLong [ 3 ] ;
memset ( idxX , 0 , sizeof ( Nd4jLong ) * 3 ) ;
memset ( idxY , 0 , sizeof ( Nd4jLong ) * 3 ) ;
memset ( idxZ , 0 , sizeof ( Nd4jLong ) * 3 ) ;
PRAGMA_OMP_SIMD
for ( int k = 0 ; k < 3 ; + + k ) {
offsetPerDimX [ k ] = ( shape [ k ] - 1 ) * strides [ k ] ;
offsetPerDimY [ k ] = ( shape [ k ] - 1 ) * strides [ k ] ;
offsetPerDimZ [ k ] = ( shape [ k ] - 1 ) * strides [ k ] ;
}
Nd4jLong initX ( 0 ) , initY ( 0 ) , initZ ( 0 ) , offsetsX ( 0 ) , offsetsY ( 0 ) , offsetsZ ( 0 ) ;
Nd4jLong rankMinusOne ( 3 - 1 ) , jX ( rankMinusOne ) , jY ( rankMinusOne ) , jZ ( rankMinusOne ) ;
// we do first iteration separately
buff [ offsetsZ ] = buff [ offsetsX ] * buff [ offsetsY ] ;
uint e = 1 ;
while ( e < len ) {
// printf("%lld, %lld, %lld\n", jX, jY, jZ);
if ( shape [ jX ] = = 1 ) { - - jX ; - - jY ; - - jZ ; continue ; }
if ( jX = = rankMinusOne ) { for ( int l = 1 ; l < shape [ jX ] ; + + l ) { offsetsX + = strides [ jX ] ; + + e ; } - - jX ; }
else if ( idxX [ jX ] < shape [ jX ] - 1 ) { initX + = strides [ jX ] ; offsetsX = initX ; + + idxX [ jX ] ; jX = rankMinusOne ; + + e ; }
else { initX - = offsetPerDimX [ jX ] ; idxX [ jX - - ] = 0 ; }
if ( jY = = rankMinusOne ) { for ( int l = 1 ; l < shape [ jY ] ; + + l ) { offsetsY + = strides [ jY ] ; } - - jY ; }
else if ( idxY [ jY ] < shape [ jY ] - 1 ) { initY + = strides [ jY ] ; offsetsY = initY ; + + idxY [ jY ] ; jY = rankMinusOne ; }
else { initY - = offsetPerDimY [ jY ] ; idxY [ jY - - ] = 0 ; }
if ( jZ = = rankMinusOne ) { for ( int l = 1 ; l < shape [ jZ ] ; + + l ) { offsetsZ + = strides [ jZ ] ; } - - jZ ; }
else if ( idxZ [ jZ ] < shape [ jZ ] - 1 ) { initZ + = strides [ jZ ] ; offsetsZ = initZ ; + + idxZ [ jZ ] ; jZ = rankMinusOne ; }
else { initZ - = offsetPerDimZ [ jZ ] ; idxZ [ jZ - - ] = 0 ; }
buff [ offsetsZ ] = buff [ offsetsX ] * buff [ offsetsY ] ;
}
delete [ ] idxX ;
delete [ ] idxY ;
delete [ ] idxZ ;
delete [ ] offsetPerDimX ;
delete [ ] offsetPerDimY ;
delete [ ] offsetPerDimZ ;
}
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto myTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
//***********************************
//***********************************
timeStart = std : : chrono : : system_clock : : now ( ) ;
// uint xShapeInfoCast[MAX_RANK];
// uint yShapeInfoCast[MAX_RANK];
// uint zShapeInfoCast[MAX_RANK];
// bool canCastX = DataTypeUtils::castShapeInfo(shapeInfo, xShapeInfoCast);
// bool canCastY = DataTypeUtils::castShapeInfo(shapeInfo, yShapeInfoCast);
// bool canCastZ = DataTypeUtils::castShapeInfo(shapeInfo, zShapeInfoCast);
// for (int i = 0; i < N; ++i)
// {
// PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(1)
// for (uint i0 = 0; i0 < shape[0]; ++i0)
// for (uint i1 = 0; i1 < shape[1]; ++i1)
// for (uint i2 = 0; i2 < shape[2]; ++i2)
// buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] = buff[i0*strides[0]+i1*strides[1]+i2*strides[2]] * buff[i0*strides[0]+i1*strides[1]+i2*strides[2]];
// }
Nd4jLong * xOffsets , * yOffsets , * zOffsets ;
xOffsets = new Nd4jLong [ len ] ;
yOffsets = new Nd4jLong [ len ] ;
zOffsets = new Nd4jLong [ len ] ;
for ( int i = 0 ; i < N ; + + i )
{
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_PARALLEL_SECTIONS
2019-06-06 14:21:15 +02:00
{
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , xOffsets ) ;
}
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , yOffsets ) ;
}
2019-07-18 13:13:56 +02:00
PRAGMA_OMP_SECTION
2019-06-06 14:21:15 +02:00
{
shape : : calcOffsets ( 3 , shape , strides , zOffsets ) ;
}
}
PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( uint i = 0 ; i < len ; i + + )
buff [ zOffsets [ i ] ] = buff [ xOffsets [ i ] ] * buff [ yOffsets [ i ] ] ;
}
timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto oldTime = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
delete [ ] xOffsets ;
delete [ ] yOffsets ;
delete [ ] zOffsets ;
nd4j_printf ( " My time: %lld us; \n " , myTime ) ;
nd4j_printf ( " Old time: %lld us; \n " , oldTime ) ;
delete [ ] buff ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loops_4 ) {
const uint N = 2 ;
// const Nd4jLong dim0(256), dim1(256), dim2(256), dim3(256);
const Nd4jLong dim0 ( 10 ) , dim1 ( 10 ) , dim2 ( 10 ) , dim3 ( 10 ) ;
NDArray x ( ' c ' , { dim0 , dim1 , dim2 , dim3 } ) ;
NDArray z ( ' c ' , { dim0 , dim2 } ) ;
x = 0.1 ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i ) 32 * 512 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( uint i = 0 ; i < N ; + + i )
x . reduceAlongDimension ( reduce : : Sum , & z , { 1 , 3 } ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto myTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
nd4j_printf ( " My time: %lld us; \n " , myTime ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , loops_5 ) {
const uint N = 2 ;
// const Nd4jLong dim0(1024), dim1(1024), dim2(256);
const Nd4jLong dim0 ( 10 ) , dim1 ( 10 ) , dim2 ( 10 ) ;
NDArray x ( ' c ' , { dim0 , dim1 , dim2 } ) ;
NDArray z ( ' c ' , { dim0 , dim1 , dim2 } ) ;
// provide worst case
* shape : : ews ( x . shapeInfo ( ) ) = 0 ;
* shape : : ews ( z . shapeInfo ( ) ) = 0 ;
x = 0.1 ;
// warm up
for ( int i = 0 ; i < 1000 ; + + i ) 32 * 512 ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( uint i = 0 ; i < N ; + + i )
x . applyTransform ( transform : : Log , & z ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto myTime = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd - timeStart ) / N ) . count ( ) ;
nd4j_printf ( " My time: %lld us; \n " , myTime ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , im2col_1 ) {
// int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
int bS = 2 , iH = 4 , iW = 4 , iC = 3 , kH = 3 , kW = 3 , sH = 1 , sW = 1 , pH = 0 , pW = 0 , dH = 1 , dW = 1 ;
int oH = ( iH - ( kH + ( kH - 1 ) * ( dH - 1 ) ) + 2 * pH ) / sH + 1 ; // VALID
int oW = ( iW - ( kW + ( kW - 1 ) * ( dW - 1 ) ) + 2 * pW ) / sW + 1 ; // VALID
NDArray image ( ' c ' , { bS , iC , iH , iW } , nd4j : : DataType : : FLOAT32 ) ;
NDArray column ( ' c ' , { bS , iC , kH , kW , oH , oW } , nd4j : : DataType : : FLOAT32 ) ;
nd4j : : LaunchContext * context = image . getContext ( ) ;
NDArray padValue ( nd4j : : DataType : : FLOAT32 , context ) ; // scalar =0
image . linspace ( 1 , 1 ) ;
const int N = 1 ;
// warm up
nd4j : : ops : : helpers : : im2col ( * context , image , column , kH , kW , sH , sW , pH , pW , dH , dW , padValue ) ; // warm up
// ---------------------------------------- //
auto timeStart1 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + ) {
nd4j : : ops : : helpers : : im2col ( * context , image , column , kH , kW , sH , sW , pH , pW , dH , dW , padValue ) ;
// FIXME: do not use cuda methods in generic code
//cudaStreamSynchronize(*context->getCudaStream());
}
auto timeEnd1 = std : : chrono : : system_clock : : now ( ) ;
auto duration1 = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd1 - timeStart1 ) / N ) . count ( ) ;
printf ( " duration my %ld \n " , duration1 ) ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , im2col_2 ) {
// int bS=32, iH=244,iW=244, iC=3, kH=3,kW=3, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
int bS = 2 , iH = 4 , iW = 4 , iC = 3 , kH = 3 , kW = 3 , sH = 1 , sW = 1 , pH = 0 , pW = 0 , dH = 1 , dW = 1 ;
int oH = ( iH - ( kH + ( kH - 1 ) * ( dH - 1 ) ) + 2 * pH ) / sH + 1 ; // VALID
int oW = ( iW - ( kW + ( kW - 1 ) * ( dW - 1 ) ) + 2 * pW ) / sW + 1 ; // VALID
NDArray image ( ' c ' , { bS , iC , iH , iW } , nd4j : : DataType : : FLOAT32 ) ;
NDArray column ( ' c ' , { bS , iC , kH , kW , oH , oW } , nd4j : : DataType : : FLOAT32 ) ;
nd4j : : LaunchContext * context = image . getContext ( ) ;
image . linspace ( 1 , 1 ) ;
ExtraArguments extras ( std : : vector < double > ( { ( double ) kH , ( double ) kW , ( double ) sH , ( double ) sW , ( double ) pH , ( double ) pW , ( double ) dH , ( double ) dW , 0. , 0. } ) ) ;
const int N = 1 ;
// warm up
void * params = extras . argumentsAsT ( column . dataType ( ) ) ;
NativeOpExecutioner : : execTransformSame ( context , nd4j : : transform : : Im2col , image . buffer ( ) , image . getShapeInfo ( ) , image . getSpecialBuffer ( ) , image . getSpecialShapeInfo ( ) , column . buffer ( ) , column . getShapeInfo ( ) , column . getSpecialBuffer ( ) , column . getSpecialShapeInfo ( ) , params , nullptr , nullptr ) ;
// ---------------------------------------- //
auto timeStart2 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + ) {
NativeOpExecutioner : : execTransformSame ( context , nd4j : : transform : : Im2col ,
image . buffer ( ) , image . getShapeInfo ( ) , image . getSpecialBuffer ( ) , image . getSpecialShapeInfo ( ) ,
column . buffer ( ) , column . getShapeInfo ( ) , column . getSpecialBuffer ( ) , column . getSpecialShapeInfo ( ) ,
params ,
nullptr , nullptr ) ;
}
auto timeEnd2 = std : : chrono : : system_clock : : now ( ) ;
auto duration2 = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd2 - timeStart2 ) / N ) . count ( ) ;
printf ( " duration old %ld \n " , duration2 ) ;
}
/*
TEST_F ( PlaygroundTests , test_scatter_119 ) {
auto output = NDArrayFactory : : create < float > ( ' c ' , { 65536 , 512 } ) ;
auto updates = NDArrayFactory : : create < float > ( ' c ' , { 65536 , 512 } ) ;
auto indices = NDArrayFactory : : create < int > ( ' c ' , { 65536 } ) ;
int p = 0 ;
for ( int e = 65534 ; e > = 0 ; e - - )
indices . p ( p + + , e ) ;
indices . syncToDevice ( ) ;
int N = 1 ;
auto timeStart1 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + ) {
helpers : : scatter ( LaunchContext : : defaultContext ( ) , pairwise : : CopyPws , indices , updates , output , false ) ;
// FIXME: do not use cuda methods in generic code
//cudaStreamSynchronize(*context->getCudaStream());
}
auto timeEnd1 = std : : chrono : : system_clock : : now ( ) ;
auto duration1 = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd1 - timeStart1 ) / N ) . count ( ) ;
nd4j_printf ( " duration my %ld \n " , duration1 ) ;
}
TEST_F ( PlaygroundTests , test_scatter_120 ) {
auto output = NDArrayFactory : : create_ < float > ( ' c ' , { 65536 , 512 } ) ;
auto updates = NDArrayFactory : : create_ < float > ( ' c ' , { 65536 , 512 } ) ;
auto indices = NDArrayFactory : : create_ < int > ( ' c ' , { 65536 } ) ;
int p = 0 ;
for ( int e = 65534 ; e > = 0 ; e - - )
indices - > p ( p + + , e ) ;
indices - > syncToDevice ( ) ;
int N = 1 ;
auto timeStart1 = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < N ; i + + ) {
helpers : : scatter ( LaunchContext : : defaultContext ( ) , pairwise : : CopyPws , * indices , * updates , * output , false ) ;
// FIXME: do not use cuda methods in generic code
//cudaStreamSynchronize(*context->getCudaStream());
}
auto timeEnd1 = std : : chrono : : system_clock : : now ( ) ;
auto duration1 = std : : chrono : : duration_cast < std : : chrono : : milliseconds > ( ( timeEnd1 - timeStart1 ) / N ) . count ( ) ;
nd4j_printf ( " duration my %ld \n " , duration1 ) ;
delete output ;
delete indices ;
delete updates ;
}
//////////////////////////////////////////////////////////////////////
TEST_F ( PlaygroundTests , mmulMxM_1 ) {
const int numOfIters = 100 ;
const Nd4jLong M = 1024 ;
const Nd4jLong K = 1024 ;
const Nd4jLong N = 1024 ;
NDArray a ( ' f ' , { M , K } , nd4j : : DataType : : FLOAT32 ) ;
NDArray b ( ' f ' , { K , N } , nd4j : : DataType : : FLOAT32 ) ;
NDArray c ( ' c ' , { M , N } , nd4j : : DataType : : FLOAT32 ) ;
auto timeStart = std : : chrono : : system_clock : : now ( ) ;
for ( int i = 0 ; i < numOfIters ; + + i )
nd4j : : MmulHelper : : mmul ( & a , & b , & c , 1. , 0. ) ;
auto timeEnd = std : : chrono : : system_clock : : now ( ) ;
auto duration1 = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( ( timeEnd - timeStart ) / numOfIters ) . count ( ) ;
printf ( " duration %ld \n " , duration1 ) ;
}
2019-07-18 13:13:56 +02:00
*/