2019-06-06 14:21:15 +02:00
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
//
// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
//
# include <ops/declarable/helpers/transforms.h>
# include <array/ResultSet.h>
# include <helpers/ShapeUtils.h>
# include <numeric>
# include <NDArrayFactory.h>
# include <helpers/TAD.h>
# include <helpers/ConstantTadHelper.h>
# include <Loops.h>
2019-07-20 07:58:44 +02:00
# include <graph/RandomGenerator.h>
2019-06-06 14:21:15 +02:00
namespace nd4j {
namespace ops {
namespace helpers {
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void triuBP_ ( nd4j : : LaunchContext * context , const NDArray & input , const NDArray & gradO , NDArray & gradI , const int diagonal ) {
auto dOdI = NDArray ( & gradO ) ; // dO/dI
const_cast < NDArray & > ( input ) . fillAsTriangular < T > ( 0 , diagonal , dOdI . sizeAt ( - 1 ) , ' b ' , & dOdI ) ;
int dLen = dOdI . lengthOf ( ) ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment ) {
if ( dOdI . t < T > ( i ) ! = static_cast < T > ( 0.f ) )
dOdI . t < T > ( i ) = static_cast < T > ( 1.f ) ;
}
} ;
samediff : : Threads : : parallel_for ( func , 0 , dLen ) ;
2019-06-06 14:21:15 +02:00
// FIXME: !!!
gradI . assign ( dOdI * gradO ) ; // chain rule: dLoss/dI = dO/dI * dLoss/dO
}
void triuBP ( nd4j : : LaunchContext * context , const NDArray & input , const NDArray & gradO , NDArray & gradI , const int diagonal ) {
BUILD_SINGLE_SELECTOR ( gradO . dataType ( ) , triuBP_ , ( context , input , gradO , gradI , diagonal ) , LIBND4J_TYPES ) ;
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void trace_ ( const NDArray & input , NDArray & output ) {
const int inRank = input . rankOf ( ) ;
auto setOfSubArrs = input . allTensorsAlongDimension ( { inRank - 2 , inRank - 1 } ) ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment )
output . p ( i , setOfSubArrs - > at ( i ) - > getTrace ( ) ) ;
} ;
samediff : : Threads : : parallel_for ( func , 0 , setOfSubArrs - > size ( ) ) ;
2019-06-06 14:21:15 +02:00
delete setOfSubArrs ;
}
void trace ( nd4j : : LaunchContext * context , const NDArray & input , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , trace_ , ( input , output ) , LIBND4J_TYPES ) ;
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
2019-07-20 07:58:44 +02:00
void randomShuffle_ ( NDArray & input , NDArray & output , nd4j : : graph : : RandomGenerator & rng , const bool isInplace ) {
2019-06-06 14:21:15 +02:00
// check edge cases first
int temp ;
const int firstDim = input . sizeAt ( 0 ) ;
if ( input . lengthOf ( ) = = 1 | | firstDim = = 1 ) {
if ( ! isInplace )
output . assign ( input ) ;
}
else if ( input . isVector ( ) | | shape : : isLikeVector ( input . getShapeInfo ( ) , temp ) ) {
// apply Fisher-Yates shuffle
if ( isInplace ) {
2019-07-20 07:58:44 +02:00
//PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold())
2019-06-06 14:21:15 +02:00
for ( int i = firstDim - 1 ; i > 0 ; - - i ) {
2019-07-20 07:58:44 +02:00
int r = rng . relativeInt ( i ) % i ;
2019-06-06 14:21:15 +02:00
if ( i = = r )
continue ;
2019-07-20 07:58:44 +02:00
T t0 = input . t < T > ( i ) ;
T t1 = input . t < T > ( r ) ;
2019-06-06 14:21:15 +02:00
//math::nd4j_swap<T>(input(i), input(r));
2019-07-20 07:58:44 +02:00
input . t < T > ( i ) = t1 ;
input . t < T > ( r ) = t0 ;
2019-06-06 14:21:15 +02:00
}
}
else {
std : : vector < int > indices ( firstDim ) ;
std : : iota ( indices . begin ( ) , indices . end ( ) , 0 ) ;
output . p < T > ( Nd4jLong ( 0 ) , input . e < T > ( 0 ) ) ;
2019-11-13 15:15:18 +01:00
// FIXME: parallelism!!
2019-06-06 14:21:15 +02:00
for ( int i = firstDim - 1 ; i > 0 ; - - i ) {
2019-07-20 07:58:44 +02:00
int r = rng . relativeInt ( i ) % i ;
output . t < T > ( i ) = input . t < T > ( indices [ r ] ) ;
2019-06-06 14:21:15 +02:00
if ( i = = r )
continue ;
2019-07-20 07:58:44 +02:00
output . t < T > ( r ) = input . t < T > ( indices [ i ] ) ;
2019-06-06 14:21:15 +02:00
math : : nd4j_swap < int > ( indices [ i ] , indices [ r ] ) ;
}
rng . rewindH ( firstDim - 1 ) ;
}
}
else {
// evaluate sub-arrays list of input array through all dimensions excluding first one
std : : vector < int > dimensions = ShapeUtils : : evalDimsToExclude ( input . rankOf ( ) , { 0 } ) ;
auto subArrsListIn = input . allTensorsAlongDimension ( dimensions ) ;
// apply Fisher-Yates shuffle
if ( isInplace ) {
2019-07-20 07:58:44 +02:00
//PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold())
for ( int i = firstDim - 1 ; i > 0 ; - - i ) {
int r = rng . relativeInt ( i ) % i ;
2019-06-06 14:21:15 +02:00
if ( i = = r )
continue ;
subArrsListIn - > at ( i ) - > swapUnsafe ( * subArrsListIn - > at ( r ) ) ;
}
}
else {
// evaluate sub-arrays list of output array through all dimensions excluding first one
auto subArrsListOut = output . allTensorsAlongDimension ( dimensions ) ;
std : : vector < int > indices ( firstDim ) ;
std : : iota ( indices . begin ( ) , indices . end ( ) , 0 ) ;
bool isZeroShuffled = false ;
2019-07-20 07:58:44 +02:00
//PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold())
for ( int i = firstDim - 1 ; i > 0 ; - - i ) {
int r = rng . relativeInt ( i ) % i ;
2019-06-06 14:21:15 +02:00
subArrsListOut - > at ( i ) - > assign ( subArrsListIn - > at ( indices [ r ] ) ) ;
if ( r = = 0 )
isZeroShuffled = true ;
if ( i = = r )
continue ;
subArrsListOut - > at ( r ) - > assign ( subArrsListIn - > at ( indices [ i ] ) ) ;
math : : nd4j_swap < int > ( indices [ i ] , indices [ r ] ) ;
}
if ( ! isZeroShuffled )
subArrsListOut - > at ( 0 ) - > assign ( subArrsListIn - > at ( 0 ) ) ;
delete subArrsListOut ;
}
rng . rewindH ( firstDim - 1 ) ;
delete subArrsListIn ;
}
}
2019-07-20 07:58:44 +02:00
void randomShuffle ( nd4j : : LaunchContext * context , NDArray & input , NDArray & output , nd4j : : graph : : RandomGenerator & rng , const bool isInplace ) {
2019-06-06 14:21:15 +02:00
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , randomShuffle_ , ( input , output , rng , isInplace ) , LIBND4J_TYPES ) ;
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
void pad_ ( const int mode , const NDArray & input , const NDArray & paddings , NDArray & output , const NDArray & padValue ) {
const T * x = input . bufferAsT < T > ( ) ;
T * z = output . bufferAsT < T > ( ) ;
const Nd4jLong * xShape = input . shapeOf ( ) ;
const Nd4jLong * zShape = output . shapeOf ( ) ;
const int rank = input . rankOf ( ) ; // both input and output have the same rank
const int rankMinusOne = rank - 1 ;
const auto zLen = output . lengthOf ( ) ;
if ( mode = = 0 ) { // CONSTANT case
const T padVal = padValue . e < T > ( 0 ) ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords [ MAX_RANK ] ;
for ( auto i = start ; i < stop ; i + = increment ) {
shape : : index2coords ( i , output . getShapeInfo ( ) , coords ) ;
const auto zOffset = shape : : getOffset ( output . getShapeInfo ( ) , coords ) ;
bool within = true ;
for ( int j = rankMinusOne ; j > = 0 ; - - j ) {
if ( xShape [ j ] = = zShape [ j ] ) continue ;
const auto left = paddings . e < Nd4jLong > ( j , 0 ) ;
if ( coords [ j ] < left | | coords [ j ] > = left + xShape [ j ] ) {
within = false ;
break ;
}
else { coords [ j ] = coords [ j ] - left ; }
}
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
if ( within )
z [ zOffset ] = x [ shape : : getOffset ( input . getShapeInfo ( ) , coords ) ] ;
else
z [ zOffset ] = padVal ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
samediff : : Threads : : parallel_tad ( func , 0 , zLen ) ;
2019-06-06 14:21:15 +02:00
}
else { // REFLECT and SYMMETRIC cases
const Nd4jLong shift1 = mode = = 1 ? 0 : 1 ; // REFLECT : SYMMETRIC
const Nd4jLong shift2 = mode = = 1 ? 2 : 1 ; // REFLECT : SYMMETRIC
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords [ MAX_RANK ] ;
for ( auto i = start ; i < stop ; i + = increment ) {
shape : : index2coords ( i , output . getShapeInfo ( ) , coords ) ;
const auto zOffset = shape : : getOffset ( output . getShapeInfo ( ) , coords ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
for ( int j = rankMinusOne ; j > = 0 ; - - j ) {
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
if ( xShape [ j ] = = zShape [ j ] ) continue ;
coords [ j ] = coords [ j ] - paddings . e < Nd4jLong > ( j , 0 ) ; // are ready to fill middle (within input dimension range)
if ( coords [ j ] < 0 ) coords [ j ] = - coords [ j ] - shift1 ; // means fill from left
else if ( coords [ j ] > = xShape [ j ] ) coords [ j ] = 2 * xShape [ j ] - coords [ j ] - shift2 ; // means fill from right
}
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
const auto xOffset = shape : : getOffset ( input . getShapeInfo ( ) , coords ) ;
z [ zOffset ] = x [ xOffset ] ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
samediff : : Threads : : parallel_tad ( func , 0 , zLen ) ;
2019-06-06 14:21:15 +02:00
}
}
// //////////////////////////////////////////////////////////////////////////
// template<typename T>
// void pad2_(const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, NDArray const& padValue) {
// const int rank = output.rankOf();
// std::vector<int> dimsToExclude(rank);
// std::iota(dimsToExclude.begin(), dimsToExclude.end(), 0); // fill with 0, 1, ... rank-1
// Nd4jLong numLeft = paddings.e<Nd4jLong>(rank-1,0);
// Nd4jLong numRight = paddings.e<Nd4jLong>(rank-1,1);
// Nd4jLong inDimSize = input.sizeAt(rank-1);
// Nd4jLong outDimSize = output.sizeAt(rank-1);
// std::vector<std::vector<Nd4jLong>> outIdx = { std::vector<Nd4jLong>(2*rank), {numLeft, numLeft + inDimSize}, {0, numLeft}, {numLeft + inDimSize, outDimSize} };
// for(int i = 0; i < rank-1; ++i) {
// outIdx[0][2*i] = paddings.e<Nd4jLong>(i, 0);
// outIdx[0][2*i + 1] = outIdx[0][2*i] + input.sizeAt(i);
// }
// outIdx[0][2*rank-1] = outIdx[0][2*rank-2] = 0;
// // ***** populate innermost sub-arrays firstly ***** //
// dimsToExclude.pop_back();
// Nd4jLong startL = mode == 1 ? 1 : 0; // REFLECT or SYMMETRIC
// Nd4jLong startR = mode == 1 ? inDimSize-2 : inDimSize-1; // REFLECT or SYMMETRIC
// Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input.getShapeInfo(), dimsToExclude);
// NDArray outSubArr0 = output(outIdx[0], true);
// PRAGMA_OMP_PARALLEL_FOR
// for(Nd4jLong j = 0; j < numOfSubArrs; ++j) {
// NDArray outSubArr1 = outSubArr0(j, dimsToExclude);
// NDArray inSubArr = input(j, dimsToExclude);
// NDArray outSubArrMid = outSubArr1(outIdx[1]);
// outSubArrMid.assign(inSubArr); // assign middle
// if(mode == 0) { // CONSTANT
// if(numLeft != 0) {
// NDArray temp = outSubArr1(outIdx[2]);
// temp.assign(padValue); // assign left
// }
// if(numRight != 0) {
// NDArray temp = outSubArr1(outIdx[3]);
// temp.assign(padValue); // assign right
// }
// }
// else { // REFLECT or SYMMETRIC
// for(Nd4jLong k = numLeft-1, e = startL; k >= 0; --k, ++e) // fill left side
// outSubArr1.t<T>(k) = inSubArr.t<T>(e);
// for(Nd4jLong k = numLeft + inDimSize, e = startR; k < outDimSize; ++k, --e) // fill right side
// outSubArr1.t<T>(k) = inSubArr.t<T>(e);
// }
// }
// // ***** fill rest of outer sub-arrays ***** //
// std::vector<Nd4jLong> outIdxInner(2, 0);
// std::vector<Nd4jLong> outIdxOuter(2, 0);
// for(int i = rankBorder - 1; i >= 0; --i) {
// dimsToExclude.pop_back();
// outIdxInner.push_back(0), outIdxInner.push_back(0);
// outIdxOuter.push_back(0), outIdxOuter.push_back(0);
// Nd4jLong numLeft = paddings.e<Nd4jLong>(i, 0);
// Nd4jLong numRight = paddings.e<Nd4jLong>(i, 1);
// if(numLeft == 0 && numRight == 0)
// continue;
// Nd4jLong inDimSize = input.sizeAt(i);
// Nd4jLong outDimSize = output.sizeAt(i);
// if(mode == 0) {
// outIdxOuter[0] = 0; outIdxOuter[1] = numLeft;
// outIdxInner[0] = numLeft + inDimSize; outIdxInner[1] = outDimSize;
// }
// startL = mode == 1 ? numLeft + 1 : numLeft; // REFLECT or SYMMETRIC
// startR = mode == 1 ? numLeft + inDimSize - 2 : numLeft + inDimSize-1; // REFLECT or SYMMETRIC
// numOfSubArrs = ShapeUtils::getNumOfSubArrs(output.getShapeInfo(), dimsToExclude);
// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(outIdxOuter, outIdxInner))
// for(Nd4jLong j = 0; j < numOfSubArrs; ++j) {
// NDArray outSubArr = output(j, dimsToExclude);
// if(mode == 0) { // CONSTANT
// if(numLeft != 0) {
// NDArray tempO = outSubArr(outIdxOuter);
// tempO.assign(padValue); // assign left
// }
// if(numRight != 0) {
// NDArray tempI = outSubArr(outIdxInner);
// tempI.assign(padValue); // assign right
// }
// }
// else { // REFLECT or SYMMETRIC
// for(Nd4jLong k = numLeft-1, e = startL; k >= 0; --k, ++e) { // fill left side
// outIdxOuter[0] = k;
// outIdxOuter[1] = k+1;
// outIdxInner[0] = e;
// outIdxInner[1] = e+1;
// NDArray outSubArrInner = outSubArr(outIdxInner);
// NDArray outSubArrOuter = outSubArr(outIdxOuter);
// outSubArrOuter.assign(outSubArrInner);
// }
// for(Nd4jLong k = numLeft + inDimSize, e = startR; k < outDimSize; ++k, --e) { // fill right side
// outIdxOuter[0] = k;
// outIdxOuter[1] = k+1;
// outIdxInner[0] = e;
// outIdxInner[1] = e+1;
// NDArray outSubArrInner = outSubArr(outIdxInner);
// NDArray outSubArrOuter = outSubArr(outIdxOuter);
// outSubArrOuter.assign(outSubArrInner);
// }
// }
// }
// }
// }
void pad ( nd4j : : LaunchContext * context , const int mode , const NDArray & input , const NDArray & paddings , NDArray & output , NDArray const & padValue ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , pad_ , ( mode , input , paddings , output , padValue ) , LIBND4J_TYPES ) ;
}
////////////////////////////////////////////////////////////////////////
/*// initial values of inIdx, outIdx, dim must be equal to zero
template < typename T >
static void recursiveLoopForPad_ ( const int mode , NDArray & input , const NDArray & paddings , NDArray & output , std : : vector < int > dimensions , int dim , int inIdx , int outIdx , NDArray & padValue ) {
int leftOffset ;
// dimensions are array of input dimensions, it is sorted in increasing order
// every time at the beginning we erase first element from it (not good idea to use vector for this purpose, but luckily it is small enough)
// then we use this array for tads building, every time while recursion the number of built tads becomes bigger
dimensions . erase ( dimensions . begin ( ) ) ;
// build tad basing on output array, also create auxiliary arrays pointing on required output array ranges
shape : : TAD tadOut ( output . getShapeInfo ( ) , dimensions . data ( ) , dimensions . size ( ) ) ;
tadOut . createTadOnlyShapeInfo ( ) ;
tadOut . createOffsets ( ) ;
auto subArrOut = NDArray ( output . getBuffer ( ) , tadOut . tadOnlyShapeInfo , output . getContext ( ) ) ;
auto subArr = NDArray ( output . getBuffer ( ) , tadOut . tadOnlyShapeInfo , output . getContext ( ) ) ;
// build tad basing on input array, also create auxiliary array pointing on required input array range
shape : : TAD tadIn ( input . getShapeInfo ( ) , dimensions . data ( ) , dimensions . size ( ) ) ;
tadIn . createTadOnlyShapeInfo ( ) ;
tadIn . createOffsets ( ) ;
auto subArrIn = NDArray ( input . getBuffer ( ) , tadIn . tadOnlyShapeInfo , output . getContext ( ) ) ;
// these indices take into account recursion and always point to actual tads numbers
if ( input . rankOf ( ) > 1 & & output . rankOf ( ) > 1 ) { // only for non-vector cases
outIdx = outIdx * output . sizeAt ( dim + 1 ) ;
inIdx = inIdx * input . sizeAt ( dim + 1 ) ;
}
// current input tad number, we add to it unity in a loop
int k = - 1 ;
// loop through current dimension
for ( int i = 0 ; i < output . sizeAt ( dim ) ; + + i ) {
// corresponds to outer range (relevant indices are absent in input)
leftOffset = paddings . e < int > ( dim , 0 ) ;
if ( i < leftOffset | | i > = ( input . sizeAt ( dim ) + leftOffset ) )
continue ;
// increase input tads number
+ + k ;
// recursion condition allows for the fact that tad can't reduce to scalar
if ( dim < input . rankOf ( ) - 2 )
recursiveLoopForPad ( mode , input , paddings , output , dimensions , dim + 1 , inIdx + k , outIdx + i , padValue ) ;
else if ( paddings . sizeAt ( 0 ) > dim + 1 ) {
leftOffset = paddings . e < int > ( dim + 1 , 0 ) ;
// shift buffers pointers to actual element position
if ( output . rankOf ( ) > 1 ) {
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + i ] ) ;
subArrIn . setBuffer ( reinterpret_cast < T * > ( input . getBuffer ( ) ) + tadIn . tadOffsets [ inIdx + i - paddings . e < int > ( dim , 0 ) ] ) ;
}
else {
subArrOut . p ( i , subArrIn . e < T > ( i - leftOffset ) ) ;
}
// most inner loop, corresponds to last dim = rank-1
switch ( mode ) {
case 0 : // CONSTANT mode
for ( int j = 0 ; j < subArrOut . lengthOf ( ) ; + + j )
if ( j < leftOffset | | j > = ( subArrIn . lengthOf ( ) + leftOffset ) ) // firstly fill with zeros outer ranges
subArrOut . p ( j , ( T ) 0.f ) ;
else
subArrOut . p ( j , subArrIn . e < T > ( j - leftOffset ) ) ; // fill middle with elements of input array
break ;
case 1 : // REFLECT mode
for ( int j = 1 ; j < = leftOffset ; + + j ) // fill firstly left side
subArrOut . p ( leftOffset - j , subArrIn . e < T > ( j ) ) ;
for ( int j = 0 ; j < subArrIn . lengthOf ( ) ; + + j ) // fill middle
subArrOut . p ( leftOffset + j , subArrIn . e < T > ( j ) ) ;
for ( int j = ( subArrOut . lengthOf ( ) - leftOffset ) ; j < subArrOut . lengthOf ( ) ; + + j ) // fill right side
subArrOut . p ( j , subArrIn . e < T > ( subArrOut . lengthOf ( ) - j - 1 ) ) ;
break ;
case 2 : // SYMMETRIC mode
for ( int j = 1 ; j < = leftOffset ; + + j ) // fill firstly left side
subArrOut . p ( leftOffset - j , subArrIn . e < T > ( j - 1 ) ) ;
for ( int j = 0 ; j < subArrIn . lengthOf ( ) ; + + j ) // fill middle
subArrOut . p ( leftOffset + j , subArrIn . e < T > ( j ) ) ;
for ( int j = ( subArrOut . lengthOf ( ) - leftOffset ) ; j < subArrOut . lengthOf ( ) ; + + j ) // fill right side
subArrOut . p ( j , subArrIn . e < T > ( subArrOut . lengthOf ( ) - j ) ) ;
break ;
}
}
else {
if ( mode = = 0 & & input . rankOf ( ) < 2 )
subArrOut . p ( i , subArrIn . e < T > ( i - leftOffset ) ) ; // fill middle with elements of input array
}
}
// populate sub-array formed previously
leftOffset = paddings . e < int > ( dim , 0 ) ;
switch ( mode ) {
case 0 : // CONSTANT mode
for ( int j = 1 ; j < = leftOffset ; + + j ) {
// fill left side with padValue
if ( output . rankOf ( ) > 1 ) {
subArrOut . setBuffer (
reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + leftOffset - j ] ) ;
subArrOut . assign ( padValue ) ;
}
else {
subArrOut . p ( j - 1 , padValue ) ;
}
}
// output.printIndexedBuffer("Output at");
for ( int j = ( output . sizeAt ( dim ) - leftOffset ) ; j < output . sizeAt ( dim ) ; + + j ) { // fill left side with zeros
if ( output . rankOf ( ) > 1 ) {
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + j ] ) ;
subArrOut . assign ( padValue ) ;
}
else {
subArrOut . p ( j , padValue ) ;
}
}
break ;
case 1 : // REFLECT mode
for ( int j = 1 ; j < = leftOffset ; + + j ) { // fill left side
subArr . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + leftOffset + j ] ) ;
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + leftOffset - j ] ) ;
subArrOut . assign ( & subArr ) ;
}
for ( int j = ( output . sizeAt ( dim ) - leftOffset ) ; j < output . sizeAt ( dim ) ; + + j ) { // fill right side
subArr . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + output . sizeAt ( dim ) + leftOffset - 1 - j ] ) ;
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + j ] ) ;
subArrOut . assign ( & subArr ) ;
}
break ;
case 2 : // SYMMETRIC mode
for ( int j = 1 ; j < = leftOffset ; + + j ) { // fill left side
subArr . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + leftOffset + j - 1 ] ) ;
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + leftOffset - j ] ) ;
subArrOut . assign ( & subArr ) ;
}
for ( int j = ( output . sizeAt ( dim ) - leftOffset ) ; j < output . sizeAt ( dim ) ; + + j ) { // fill right side
subArr . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + output . sizeAt ( dim ) + leftOffset - j ] ) ;
subArrOut . setBuffer ( reinterpret_cast < T * > ( output . getBuffer ( ) ) + tadOut . tadOffsets [ outIdx + j ] ) ;
subArrOut . assign ( & subArr ) ;
}
break ;
}
}
*/
/*
void recursiveLoopForPad ( const int mode , NDArray & input , const NDArray & paddings , NDArray & output , std : : vector < int > dimensions , int dim , int inIdx , int outIdx , NDArray & padValue ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , recursiveLoopForPad_ , ( mode , input , paddings , output , dimensions , dim , inIdx , outIdx , padValue ) , LIBND4J_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void recursiveLoopForPad_ , ( const int mode , NDArray & input , const NDArray & paddings , NDArray & output , std : : vector < int > dimensions , int dim , int inIdx , int outIdx , NDArray & padValue ) , LIBND4J_TYPES ) ;
*/
////////////////////////////////////////////////////////////////////////
void invertPermutation ( nd4j : : LaunchContext * context , const NDArray & input , NDArray & output ) {
std : : set < int > uniqueElems ;
const int length = input . lengthOf ( ) ;
for ( int i = 0 ; i < length ; + + i ) {
int elem = input . e < int > ( i ) ;
if ( ! uniqueElems . insert ( elem ) . second ) // this operation forbids us to use #pragma omp
throw std : : runtime_error ( " helpers::invertPermutation function: input array contains duplicates ! " ) ;
if ( elem < 0 | | elem > length - 1 )
throw std : : runtime_error ( " helpers::invertPermutation function: element of input array is out of range (0, length-1) ! " ) ;
output . p < int > ( elem , i ) ;
}
}
////////////////////////////////////////////////////////////////////////
2019-08-02 19:01:03 +02:00
template < typename X , typename Y >
2019-06-06 14:21:15 +02:00
static void gatherND_ ( NDArray & input , NDArray & indices , NDArray & output ) {
2019-08-02 19:01:03 +02:00
const X * x = reinterpret_cast < X * > ( input . getBuffer ( ) ) ;
const Y * y = reinterpret_cast < Y * > ( indices . getBuffer ( ) ) ;
X * z = reinterpret_cast < X * > ( output . getBuffer ( ) ) ;
2019-06-06 14:21:15 +02:00
2019-08-02 19:01:03 +02:00
const int xRank = input . rankOf ( ) ;
const int yRank = indices . rankOf ( ) ;
const int zRank = output . rankOf ( ) ;
const int maxRank = nd4j : : math : : nd4j_max < int > ( yRank , nd4j : : math : : nd4j_max < int > ( xRank , zRank ) ) ;
2019-06-06 14:21:15 +02:00
2019-08-02 19:01:03 +02:00
const Nd4jLong zLen = output . lengthOf ( ) ;
2019-06-06 14:21:15 +02:00
2019-08-02 19:01:03 +02:00
const int yLastDim = indices . sizeAt ( - 1 ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords [ MAX_RANK * 3 ] ;
for ( auto i = start ; i < stop ; i + = increment ) {
Nd4jLong * zCoordStart , * xCoordStart ;
if ( yLastDim = = xRank ) {
zCoordStart = coords ;
xCoordStart = coords ;
} else if ( zRank > = xRank ) {
zCoordStart = coords ;
xCoordStart = coords + zRank - xRank ;
} else {
zCoordStart = coords + xRank - zRank ;
xCoordStart = coords ;
}
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
shape : : index2coords ( i , output . getShapeInfo ( ) , zCoordStart ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
const auto zOffset = shape : : getOffset ( output . getShapeInfo ( ) , zCoordStart ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
// last y coordinate
uint coordToRestore ;
if ( yLastDim ! = xRank )
coordToRestore = static_cast < uint > ( zCoordStart [ yRank - 1 ] ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
zCoordStart [ yRank - 1 ] = 0 ;
const auto yOffset = shape : : getOffset ( indices . getShapeInfo ( ) , zCoordStart ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
//restore z coordinate
if ( yLastDim ! = xRank )
zCoordStart [ yRank - 1 ] = coordToRestore ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
// construct coordinates for x
for ( uint j = 0 ; j < yLastDim ; + + j )
xCoordStart [ j ] = y [ yOffset + j * indices . stridesOf ( ) [ yRank - 1 ] ] ; // last stride
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
const auto xOffset = shape : : getOffset ( input . getShapeInfo ( ) , xCoordStart ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
z [ zOffset ] = x [ xOffset ] ;
}
} ;
2019-08-02 19:01:03 +02:00
2019-11-13 15:15:18 +01:00
samediff : : Threads : : parallel_tad ( func , 0 , zLen ) ;
2019-08-02 19:01:03 +02:00
}
2019-06-06 14:21:15 +02:00
2019-08-02 19:01:03 +02:00
////////////////////////////////////////////////////////////////////////
void gatherND ( nd4j : : LaunchContext * context , NDArray & input , NDArray & indices , NDArray & output ) {
[WIP] multi-device support (#80)
* fix pad javadoc and @see links. (#72)
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* [WIP] More fixes (#73)
* special tests for ConstantTadHelper/ConstantShapeHelper
Signed-off-by: raver119 <raver119@gmail.com>
* release methods for data buffers
Signed-off-by: raver119 <raver119@gmail.com>
* delete temporary buffer Java side
Signed-off-by: raver119 <raver119@gmail.com>
* delete temporary buffer Java side
Signed-off-by: raver119 <raver119@gmail.com>
* delete temporary TadPack C++/Java side (#74)
Signed-off-by: raver119 <raver119@gmail.com>
* Zoo model TF import test updates (#75)
* argLine fix, update compression_gru comment
* updated comment for xception
* undid but commented argLine change
* updated xlnet comment
* copyright headers
* - new NDArray methods like()/ulike() (#77)
- fix for depthwise_conv2d_bp + special test
Signed-off-by: raver119 <raver119@gmail.com>
* upsampling2d fix CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* DL4J trace logging (#79)
* MLN/CG trace logging for debugging
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Tiny tweak
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* strided_slice_bp shape fn leak fix
Signed-off-by: raver119 <raver119@gmail.com>
* SameDiff fixes and naming (#78)
* remove SDVariable inplace methods
* import methods
* npe fix in OpVal
* removed SameDiff inplace ops from tests
* Naming updates, moved to centralized methods in SameDiff, should use op_#:# for everything
* quick fixes
* javadoc
* SDVariable eval with placeholders
* use regex match
* better matching
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* fix javadoc. (#76)
* fix javadoc.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* replace most @see with @link s.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* 4 additional tests
Signed-off-by: raver119 <raver119@gmail.com>
* launch context reorganization
Signed-off-by: raver119 <raver119@gmail.com>
* LaunchContext reorganization
Signed-off-by: raver119 <raver119@gmail.com>
* per-device LaunchContext
Signed-off-by: raver119 <raver119@gmail.com>
* Various DL4J/ND4J fixes (#81)
* #7954 Force refresh of UI when switching tabs on overview page
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8017 Concurrent modification exception (synchronize) fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8033 Don't initialize updater in middle of writing memory crash dump
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8208 Fix shape checks for ND4J int[] creator methods
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #6385 #7992 Keras import naming fixes + cleanup
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8016 Upsampling3D - add NDHWC format support
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* ContextBuffers as separate entity
Signed-off-by: raver119 <raver119@gmail.com>
* Refactor NativeOps.h to export C functions
* Actually export functions from NativeOps.h
* Adapt the Java wrappers in ND4J generated with JavaCPP
* Create C wrappers for some of the C++ classes currently used by ND4J
* ContextBuffers as separate entity
Signed-off-by: raver119 <raver119@gmail.com>
* remove duplicate code in createBufferDetached. (#83)
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* Keras model import - updater lr fix (#84)
* Keras model import - updater lr fix
Signed-off-by: eraly <susan.eraly@gmail.com>
* Keras model import - updater lr fix, cleanup
Signed-off-by: eraly <susan.eraly@gmail.com>
* ContextBuffers as separate entity
Signed-off-by: raver119 <raver119@gmail.com>
* ContextBuffers as separate entity
Signed-off-by: raver119 <raver119@gmail.com>
* Fix functions of OpaqueVariablesSet
* thread-local buffers/affinity
Signed-off-by: raver119 <raver119@gmail.com>
* thread safety for LaunchContext
Signed-off-by: raver119 <raver119@gmail.com>
* more of thread safety
Signed-off-by: raver119 <raver119@gmail.com>
* one more multi threaded test
Signed-off-by: raver119 <raver119@gmail.com>
* SameDiff Convolution Config validation, better output methods (#82)
* Conv Config validation & tests
Signed-off-by: Ryan Nett <rnett@skymind.io>
* stackOutputs utility method
Signed-off-by: Ryan Nett <rnett@skymind.io>
* use constructor for validation, support negative kernel sizes (infered from weights)
Signed-off-by: Ryan Nett <rnett@skymind.io>
* better output methods
Signed-off-by: Ryan Nett <rnett@skymind.io>
* move output to be with fit and evaluate
Signed-off-by: Ryan Nett <rnett@skymind.io>
* fixes
Signed-off-by: Ryan Nett <rnett@skymind.io>
* more fixes
Signed-off-by: Ryan Nett <rnett@skymind.io>
* refactor duplicate code from pad methods. (#86)
* refactor duplicate code from pad methods.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* replace switch with if.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* Various ND4J/DL4J fixes and improvements (#87)
* Reshape and reallocate - small fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Reshape and reallocate - small fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #6488 ElementWiseVertex broadcast support
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Constructors and broadcast supported it Transforms.max/min
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8054 ElementWiseVertex now supports broadcast inputs
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8057 Nd4j.create overload dtype fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7551 ND4J Shape validation fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] Numpy boolean import (#91)
* numpy bool type
Signed-off-by: raver119 <raver119@gmail.com>
* numpy bool java side
Signed-off-by: raver119 <raver119@gmail.com>
* remove create method with unused parameter. (#89)
* remove create method with unused parameter.
* removed more unused methods.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* removing more unused code.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* last removal of unused code.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* remove createSparse methods. (#92)
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* Various ND4J/DL4J fixes (#90)
* Deprecate Old*Op instances
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8063 #8054 Broadcast exceptions + cleanup inplace ops
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Small fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Remove bad test condition
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7993 Fix shape function issue in crop_and_resize op
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* DL4J SameDiff lambda layer fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8029 Fix for pnorm backprop math
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #8038 Fix Op profiler NaN/Inf triggering + add tests (#93)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* createUninitializedDetached refactoring. (#94)
* wip
* update interface, add null implementations.
* Breaking one test in a weird way.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* createUninitializedDetached refactored.
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
* cuda build fix for issues introduced by recent refactoring
Signed-off-by: raver119 <raver119@gmail.com>
* [WIP] More of CUDA (#95)
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* Implementation of hashcode cuda helper. Working edition.
* Fixed parallel test input arangements.
* Fixed tests for hashcode op.
* Fixed shape calculation for image:crop_and_resize op and test.
* NativeOps tests. Initial test suite.
* Added tests for indexReduce methods.
* Added test on execBroadcast with NDArray as dimensions.
* Added test on execBroadcastBool with NDArray as dimensions.
* Added tests on execPairwiseTransform and execPairwiseTransofrmBool.
* Added tests for execReduce with scalar results.
* Added reduce tests for non-empty dims array.
* Added tests for reduce3.
* Added tests for execScalar.
* Added tests for execSummaryStats.
* - provide cpu/cuda code for batch_to_space
- testing it
Signed-off-by: Yurii <yurii@skymind.io>
* - remove old test for batch_to_space (had wrong format and numbers were not checked)
Signed-off-by: Yurii <yurii@skymind.io>
* Fixed complilation errors with test.
* Added test for execTransformFloat.
* Added test for execTransformSame.
* Added test for execTransformBool.
* Added test for execTransformStrict.
* Added tests for execScalar/execScalarBool with TADs.
* Added test for flatten.
* - provide cpu/cuda code for space_to_Batch operaion
Signed-off-by: Yurii <yurii@skymind.io>
* Added test for concat.
* comment unnecessary stuff in s_t_b
Signed-off-by: Yurii <yurii@skymind.io>
* Added test for specialConcat.
* Added tests for memcpy/set routines.
* Fixed pullRow cuda test.
* Added pullRow test.
* Added average test.
* - correct typo in NDArray::applyPairwiseTransform(nd4j::pairwise::BoolOps op...)
Signed-off-by: Yurii <yurii@skymind.io>
* - debugging and fixing cuda tests in JavaInteropTests file
Signed-off-by: Yurii <yurii@skymind.io>
* - correct some tests
Signed-off-by: Yurii <yurii@skymind.io>
* Added test for shuffle.
* Fixed ops declarations.
* Restored omp and added shuffle test.
* Added convertTypes test.
* Added tests for execRandom. Eliminated usage of RandomBuffer with NativeOps.
* Added sort tests.
* Added tests for execCustomOp.
* - further debuging and fixing tests terminated with crash
Signed-off-by: Yurii <yurii@skymind.io>
* Added tests for calculateOutputShapes.
* Addded Benchmarks test.
* Commented benchmark tests.
* change assertion
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for apply_sgd op. Added cpu helper for that op.
* Implement cuda helper for aplly_sgd op. Fixed tests for NativeOps.
* Added test for assign broadcastable.
* Added tests for assign_bp op.
* Added tests for axpy op.
* - assign/execScalar/execTransformAny signature change
- minor test fix
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed axpy op.
* meh
Signed-off-by: raver119 <raver119@gmail.com>
* - fix tests for nativeOps::concat
Signed-off-by: Yurii <yurii@skymind.io>
* sequential transform/scalar
Signed-off-by: raver119 <raver119@gmail.com>
* allow nested parallelism
Signed-off-by: raver119 <raver119@gmail.com>
* assign_bp leak fix
Signed-off-by: raver119 <raver119@gmail.com>
* block setRNG fix
Signed-off-by: raver119 <raver119@gmail.com>
* enable parallelism by default
Signed-off-by: raver119 <raver119@gmail.com>
* enable nested parallelism by default
Signed-off-by: raver119 <raver119@gmail.com>
* Added cuda implementation for row_count helper.
* Added implementation for tnse gains op helper.
* - take into account possible situations when input arrays are empty in reduce_ cuda stuff
Signed-off-by: Yurii <yurii@skymind.io>
* Implemented tsne/edge_forces op cuda-based helper. Parallelized cpu-based helper for edge_forces.
* Added kernel for tsne/symmetrized op heleper.
* Implementation of tsne/symmetrized op cuda helper. Working edition.
* Eliminated waste printfs.
* Added test for broadcastgradientargs op.
* host-only fallback for empty reduce float
Signed-off-by: raver119 <raver119@gmail.com>
* - some tests fixes
Signed-off-by: Yurii <yurii@skymind.io>
* - correct the rest of reduce_ stuff
Signed-off-by: Yurii <yurii@skymind.io>
* - further correction of reduce_ stuff
Signed-off-by: Yurii <yurii@skymind.io>
* Added test for Cbow op. Also added cuda implementation for cbow helpers.
* - improve code of stack operation for scalar case
Signed-off-by: Yurii <yurii@skymind.io>
* - provide cuda kernel for gatherND operation
Signed-off-by: Yurii <yurii@skymind.io>
* Implementation of cbow helpers with cuda kernels.
* minor tests tweaks
Signed-off-by: raver119 <raver119@gmail.com>
* minor tests tweaks
Signed-off-by: raver119 <raver119@gmail.com>
* - further correction of cuda stuff
Signed-off-by: Yurii <yurii@skymind.io>
* Implementatation of cbow op helper with cuda kernels. Working edition.
* Skip random testing for cudablas case.
* lstmBlockCell context fix
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for ELU and ELU_BP ops.
* Added tests for eq_scalar, gt_scalar, gte_scalar and lte_scalar ops.
* Added tests for neq_scalar.
* Added test for noop.
* - further work on clipbynorm_bp
Signed-off-by: Yurii <yurii@skymind.io>
* - get rid of concat op call, use instead direct concat helper call
Signed-off-by: Yurii <yurii@skymind.io>
* lstmBlockCell context fix
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for lrelu and lrelu_bp.
* Added tests for selu and selu_bp.
* Fixed lrelu derivative helpers.
* - some corrections in lstm
Signed-off-by: Yurii <yurii@skymind.io>
* operator * result shape fix
Signed-off-by: raver119 <raver119@gmail.com>
* - correct typo in lstmCell
Signed-off-by: Yurii <yurii@skymind.io>
* few tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* CUDA inverse broadcast bool fix
Signed-off-by: raver119 <raver119@gmail.com>
* disable MMAP test for CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* BooleanOp syncToDevice
Signed-off-by: raver119 <raver119@gmail.com>
* meh
Signed-off-by: raver119 <raver119@gmail.com>
* additional data types for im2col/col2im
Signed-off-by: raver119 <raver119@gmail.com>
* Added test for firas_sparse op.
* one more RandomBuffer test excluded
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for flatten op.
* Added test for Floor op.
* bunch of tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* mmulDot tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* more tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* Implemented floordiv_bp op and tests.
* Fixed scalar case with cuda implementation for bds.
* - work on cuda kernel for clip_by_norm backprop op is completed
Signed-off-by: Yurii <yurii@skymind.io>
* Eliminate cbow crach.
* more tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* more tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* Eliminated abortion with batched nlp test.
* more tests fixed
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed shared flag initializing.
* disabled bunch of cpu workspaces tests
Signed-off-by: raver119 <raver119@gmail.com>
* scalar operators fix: missing registerSpecialUse call
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed logdet for cuda and tests.
* - correct clipBynorm_bp
Signed-off-by: Yurii <yurii@skymind.io>
* Fixed crop_and_resize shape datatype.
* - correct some mmul tests
Signed-off-by: Yurii <yurii@skymind.io>
* build fix
Signed-off-by: raver119 <raver119@gmail.com>
* exclude two methods for JNI
Signed-off-by: raver119 <raver119@gmail.com>
* exclude two methods for JNI
Signed-off-by: raver119 <raver119@gmail.com>
* exclude two methods for JNI (#97)
Signed-off-by: raver119 <raver119@gmail.com>
* temporary stack fix
Signed-off-by: raver119 <raver119@gmail.com>
* round robin affinity test
Signed-off-by: raver119 <raver119@gmail.com>
* get rid of legacy CudaContext methods
Signed-off-by: raver119 <raver119@gmail.com>
* get rid of legacy ContextPool classes/methods
Signed-off-by: raver119 <raver119@gmail.com>
* one legacy test removed
Signed-off-by: raver119 <raver119@gmail.com>
* few more fields rearranged
Signed-off-by: raver119 <raver119@gmail.com>
* OpaqueLaunchContext
Signed-off-by: raver119 <raver119@gmail.com>
* OpaqueLaunchContext++
Signed-off-by: raver119 <raver119@gmail.com>
* more of OpaqueLaunchContext methods
Signed-off-by: raver119 <raver119@gmail.com>
* LaunchContext -> CudaContext
Signed-off-by: raver119 <raver119@gmail.com>
* AffinityManger changes
Signed-off-by: raver119 <raver119@gmail.com>
* AffinityManger changes
Signed-off-by: raver119 <raver119@gmail.com>
* cusolver handles
Signed-off-by: raver119 <raver119@gmail.com>
* typo
Signed-off-by: raver119 <raver119@gmail.com>
* cusolver method
Signed-off-by: raver119 <raver119@gmail.com>
* cusolver handle propagated
Signed-off-by: raver119 <raver119@gmail.com>
* blas/solver handles
Signed-off-by: raver119 <raver119@gmail.com>
* one more test
Signed-off-by: raver119 <raver119@gmail.com>
* legacy concat implementations replaced with new CustomOp
Signed-off-by: raver119 <raver119@gmail.com>
* one more test
Signed-off-by: raver119 <raver119@gmail.com>
* concat now uses way more blocks
Signed-off-by: raver119 <raver119@gmail.com>
* print
Signed-off-by: raver119 <raver119@gmail.com>
* no more triple template mmul
Signed-off-by: raver119 <raver119@gmail.com>
* bunch of kernels have dtypes reconsidered
Signed-off-by: raver119 <raver119@gmail.com>
* bunch of kernels have dtypes reconsidered
Signed-off-by: raver119 <raver119@gmail.com>
* bitonic sort reorganized
Signed-off-by: raver119 <raver119@gmail.com>
* bunch of cpu stuff removed from cuda scope
Signed-off-by: raver119 <raver119@gmail.com>
* bunch of cpu stuff removed from cuda scope
Signed-off-by: raver119 <raver119@gmail.com>
* type conversions moved to generic impl
Signed-off-by: raver119 <raver119@gmail.com>
* cpu data types pass
Signed-off-by: raver119 <raver119@gmail.com>
* non_max_suppression
Signed-off-by: raver119 <raver119@gmail.com>
* sortByValue fix
Signed-off-by: raver119 <raver119@gmail.com>
* ignore all mixed datatype tests for mmul
Signed-off-by: raver119 <raver119@gmail.com>
* special handling of OpProfiler exceptions
Signed-off-by: raver119 <raver119@gmail.com>
* - one failing concat test in cpp
- Nd4j.tile now uses op internally
Signed-off-by: raver119 <raver119@gmail.com>
* get back dtype exception for legacy arrays deserialization
Signed-off-by: raver119 <raver119@gmail.com>
2019-08-14 15:52:34 +02:00
BUILD_DOUBLE_SELECTOR ( input . dataType ( ) , indices . dataType ( ) , gatherND_ , ( input , indices , output ) , LIBND4J_TYPES , INDEXING_TYPES ) ;
2019-08-02 19:01:03 +02:00
}
2019-06-06 14:21:15 +02:00
////////////////////////////////////////////////////////////////////////
// Gathers sub-arrays (or single elements) of `input` along one axis into `output`.
//
// The axis is intArgs[0] when intArgs is non-empty, otherwise 0; a negative axis
// is shifted by the rank of `input`.
//
// The positions to gather come from one of two sources:
//   * `indices` when it is not nullptr;
//   * otherwise intArgs[1..], i.e. intArgs = {axis, idx_0, idx_1, ...}.
//
// input   - array to read from
// indices - optional array of positions along the axis (may be nullptr)
// output  - pre-allocated array receiving the result
// intArgs - axis, optionally followed by the positions to gather
//
// Throws std::runtime_error if any position is negative or is not smaller than
// input->sizeAt(axis).
template <typename T>
static void gather_(NDArray* input, const NDArray* indices, NDArray* output, const std::vector<int>& intArgs) {

    int axis = intArgs.size() > 0 ? intArgs[0] : 0;
    const int inputRank = input->rankOf();
    if (axis < 0)
        axis += inputRank;

    const int numOfIntArgs = intArgs.size();

    if (indices != nullptr) {

        // validate every index before touching any data
        const auto numOfIndices = indices->lengthOf();
        for (auto i = numOfIndices - numOfIndices; i < numOfIndices; ++i) {
            const auto idx = indices->e<Nd4jLong>(i);
            // a negative index would pass the upper-bound check and address memory out of range
            if (idx < 0)
                throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be non-negative !");
            if (idx >= input->sizeAt(axis))
                throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !");
        }

        // first case: indices consist of only one scalar
        if (indices->isScalar()) {
            if (input->rankOf() <= 1) {
                // For scalar indices, rank 0 or 1 input: can't do tensor along dimension 0
                // as this is whole array... instead, we want to get a scalar
                auto idx = indices->e<Nd4jLong>(0);
                auto scalarNDArray = input->e(idx);
                output->assign(scalarNDArray);
            }
            else {
                auto dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis});
                auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);

                // array built directly on the input buffer, starting at the offset
                // tadPack reports for the requested index
                auto tadArr = NDArray(reinterpret_cast<void *>(reinterpret_cast<T*>(input->getBuffer()) + tadPack.primaryOffsets()[indices->e<Nd4jLong>(0)]), tadPack.primaryShapeInfo(), output->getContext());
                output->assign(&tadArr);
            }
        }
        else if (input->rankOf() == 1 && indices->isVector()) {
            // special case: rank-1 input with vector indices, copy element by element
            auto func = PRAGMA_THREADS_FOR {
                for (auto e = start; e < stop; e += increment)
                    output->p(e, input->e<T>(indices->e<Nd4jLong>(e)));
            };

            samediff::Threads::parallel_for(func, 0, indices->lengthOf());
        }
        else {
            // general case: the i-th sub-array of output (taken over dimsOut) receives
            // the sub-array of input selected by the i-th index along axis
            std::vector<int> dimsOut(indices->rankOf());
            std::iota(dimsOut.begin(), dimsOut.end(), axis);   // fill with axis, axis+1, ... axis+indices->rankOf()-1
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut);

            auto func = PRAGMA_THREADS_FOR {
                for (auto i = start; i < stop; i += increment) {
                    NDArray subArrOut = (*output)(i, dimsOut);
                    NDArray subArrIn  = (*input)(indices->e<Nd4jLong>(i), {axis});
                    subArrOut.assign(subArrIn);
                }
            };

            samediff::Threads::parallel_tad(func, 0, numOfSubArrs);
        }
    }
    else {

        // positions are passed through intArgs[1..]; validate them first
        for (int i = 1; i < numOfIntArgs; ++i) {
            // a negative position would pass the upper-bound check and address memory out of range
            if (intArgs[i] < 0)
                throw std::runtime_error("helpers::gather function: some of input indexes is negative !");
            if (intArgs[i] >= input->sizeAt(axis))
                throw std::runtime_error("helpers::gather function: some of input indexes is larger than corresponding shape of input array !");
        }

        // we only allow scalar/vector case here
        if (numOfIntArgs == 2) { // scalar case: intArgs = {axis, idx}
            output->assign((*input)(intArgs[1], {axis}));
        }
        else { // vector case: intArgs = {axis, idx_0, idx_1, ...}
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis});

            auto func = PRAGMA_THREADS_FOR {
                for (auto i = start; i < stop; i += increment) {
                    NDArray subArrOut = (*output)(i, {axis});
                    NDArray subArrIn  = (*input)(intArgs[i + 1], {axis});   // +1 skips the leading axis entry
                    subArrOut.assign(subArrIn);
                }
            };

            samediff::Threads::parallel_tad(func, 0, numOfSubArrs);
        }
    }
}
// Public entry point of the gather helper: picks the gather_ instantiation that
// matches the data type of `input` (one of LIBND4J_TYPES) and forwards all
// arguments to it unchanged.
void gather(NDArray* input, const NDArray* indices, NDArray* output, const std::vector<int>& intArgs) {
    BUILD_SINGLE_SELECTOR(input->dataType(), gather_, (input, indices, output, intArgs), LIBND4J_TYPES);
}
//////////////////////////////////////////////////////////////////////////
void eye ( nd4j : : LaunchContext * context , NDArray & output ) {
const int rank = output . rankOf ( ) ;
auto arrs = output . allTensorsAlongDimension ( { rank - 2 , rank - 1 } ) ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment )
arrs - > at ( i ) - > setIdentity ( ) ;
} ;
samediff : : Threads : : parallel_tad ( func , 0 , arrs - > size ( ) ) ;
2019-06-06 14:21:15 +02:00
delete arrs ;
}
//////////////////////////////////////////////////////////////////////////
void scatterUpdate ( nd4j : : LaunchContext * context , NDArray & input , NDArray & updates , const std : : vector < int > * intArgs ) {
int opCode = ( * intArgs ) [ 0 ] ;
int dimSize = ( * intArgs ) [ 1 ] ;
Nd4jLong e ;
Nd4jLong limg = 2 + dimSize ;
std : : vector < int > tadDimensions ( dimSize ) ;
for ( e = 2 ; e < limg ; e + + )
tadDimensions [ e - 2 ] = ( * intArgs ) [ e ] ;
std : : vector < int > dimsToExclude = ShapeUtils : : evalDimsToExclude ( input . rankOf ( ) , tadDimensions ) ;
// increasing counter to skip numIndices
e + + ;
std : : vector < int > indices ;
for ( ; e < intArgs - > size ( ) ; e + + )
indices . push_back ( ( * intArgs ) [ e ] ) ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment ) {
auto inSubArr = input ( indices [ i ] , dimsToExclude , true ) ;
auto updSubArr = updates ( i , dimsToExclude , true ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
if ( inSubArr . lengthOf ( ) ! = updSubArr . lengthOf ( ) )
2019-06-06 14:21:15 +02:00
continue ;
2019-11-13 15:15:18 +01:00
switch ( opCode ) {
case 0 :
inSubArr . applyPairwiseTransform ( pairwise : : Add , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 1 :
inSubArr . applyPairwiseTransform ( pairwise : : Subtract , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 2 :
inSubArr . applyPairwiseTransform ( pairwise : : Multiply , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 3 :
inSubArr . applyPairwiseTransform ( pairwise : : Divide , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 4 :
inSubArr . applyPairwiseTransform ( pairwise : : ReverseSubtract , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 5 :
inSubArr . applyPairwiseTransform ( pairwise : : ReverseDivide , & updSubArr , & inSubArr , nullptr ) ;
break ;
case 6 :
inSubArr . applyPairwiseTransform ( pairwise : : CopyPws , & updSubArr , & inSubArr , nullptr ) ;
break ;
default :
continue ;
}
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
samediff : : Threads : : parallel_tad ( func , 0 , indices . size ( ) ) ;
2019-06-06 14:21:15 +02:00
}
//////////////////////////////////////////////////////////////////////////
Merge master to upstream (#7945)
* Shugeo strided slice zeros (#14)
* Modified strided_slice op to properly work with empty-like shapes.
* Fixed test for reduce_mean with empty-like input.
* [WIP] Last merge (#15)
* correct logsoftmax looss (#2)
* Small SameDiff listener fix (#4)
* Various fixes (#6)
* #7839 Fix for asXMatrix and tests
* #7866 EmbeddingSequenceLayer dtype fix + test
* #7856 SameDiff save/load stream methods
* #7859 RegressionEvaluation rank 4 fix + tests + axis configuration
* EvaluationBinary 3d/4d
* More evaluation 3d/4d tests
* #7847 Evaluation empty checks
* Small test ifx
* #7848 Fix median edge case
* Improve DL4J samediff layer tests
* [WIP] FastText wrapper implemented (#8)
* FastText implemented
* Some fixes
* Fix shapes for wordsNearest
* Validation of input vectors
* Fixes
* Fixed test
* Thread tagged
* Some tweaks
* setContextClassLoader for DeallocatorServiceThread
* Numpy format tests (#1)
* Various fixes (#11)
* #7852 SameDiff gather fix
* #7892 SameDiff placeholder to constant conversion
* #7890 validate input rank for MLN/CG init methods
* Fix broken permute shape calculation
* Permute and gather fixes
* Tests
* #7850 LogSumExp fix + test
* Handful of test fixes
* Empty arrays with non-scalar shapes (#10)
* minor rearrangements for lambdas
* empty tensors with non-scalar shapes
* numpy empty tensors with non-scalar shapes
* few more empty tweaks
* Small fixes
* conv3d signature update
* micro fix in batchnorm mkldnn
* Import fixes
* Fix
* MKL-DNN update
* Small fill fix
* fill with empty input + test
* Fixes
* Small error improvement
* Fix
* one special test
* couple of fixes for lstm
* Rewrite TFGraphMapper.getNDArrayFromTensor to be maintainable and less error prone
* Fixes
* FP16
* Unsigned
* BFloat16
* Fill op - empty tweaks
* - couple of fixes for empty arrays construction
- stack updated
* strided slice fix
* one transform test
* provide method for reducing shapeInfo in case of input array is empty
* Fixed reduceAlongDimensions to use empty input properly.
* couple of broadcast tests
* couple of tests broadcast tests + tweak to make them pass
* add check of non-empty to methods producing sub-arrays
* Fixed reshapeC with zeros in shape.
* complete empty check in reduce_... legacy ops
* Concat and cumsum/prod
* Tweak to empty shape inference on import
* add empty check to the rest of reduce legacy ops
* one more test
* correct typo in evalReduceShapeInfoEmpty
* Added tests for reduce_* ops to tests with zero shapes.
* few more tests for empty reductions
* Fixed strided_slice op with empty case and tests.
* one more empty reduction test
* Fixed strided_slice test.
* add empty check to NDArray::reshapei
* infOrMax
* empty min/max with infinity tests
* made unstack working correctly with empty arrays
* few IndexReduce tests + tweaks for empty shapes
* add test for empty concat
* few tests fixed
* Validation fix for reductions on empty shapes
* Reverse fix
* Reduction shape calc fixes
* SameDiff.generateOutputVariable: don't use shape function to determine number of outputs
* Range fix
* - NDArray constructor updated for scalars/empty arrays
- few tests fixed
* More fixes
* Empty creator fixes
* concat fix
* concat fix
* TF import tests: allow 'both all NaN' and 'both all inf' to pass
* Slice, zero fraction, and reshape fixes
* transpose, gather
* Zero fraction
* scalar cast fix
* Empty reduction axis support
* few more tests fixed
* Fixed input checks conforming with TF for concat op and tests.
* few tests fixed
* matmul scalar shape fix
* Fixed checkout for data type and scalarity with concat to allow non-empty scalars with vector concats.
* broadcast bool fix
* few more tests
* few more tests
* correct evalReduceShapeInfoEmpty
* argmax/argmin + tests
* one more empty edge case + one more test
* argmax/argmin/realdiv_bp tweaks
* empty reshape test + fix
* Helper fixes
* Small fixes
* Gather test fix
* Gather test fix
* Small fixes
* reduce scalar zero values
* scalar mean workaround
* Remove debug code
* along dim mean workaround
* one more test
* - equalsTo() tweak for empty arrays
- one more test
* broadcast tweaks
* [WIP] Fixing outstanding issues for NLP (#9)
* Avoid using not-inited objects
* Test fixed.
* Redundant method avoided for models like FastText
* KMeans++ implementation
* KMeans++ implementation
* Disable parallel execution
* KMeans++
* Tests
* Dev branch merge (#16)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Fix some issues on master (#17)
* Fix DataVec test issue
* Fix issue with dl4j SameDiff output layer
* Dtype fix for lambda layers
* #7912 BertIterator dtype fix (use float32 not global default)
* [WIP] Next set of CUDA stuff (#7)
New CUDA implementations and improvements
* bad file
* Dev branch master merge (#23)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* SameDiff ops, TF import and fixes (#24)
* CheckNumerics tests + fixes + misc fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fake quant
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* FakeQuantWithMinMaxArgs
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* CheckNumerics fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix libnd4j ALL_INTS and ALL_FLOATS declaration (uint and bfloat types)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Small fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Javadoc
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Exception tweak
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix for out of scope stack allocated var use
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignores
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignore for known failing test (already logged issue)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Merge upstream to fork (#25)
* Add thousand-separator commas to TotalParams (#7915)
* Add thousand-separator commas to TotalParams
The number of parameters can be quite large, and it would help the reading of the summary printout to have the TotalParams column & values at the bottom have thousand-separator-commas in them.
* Add thousand-separator commas to MultiLayerNetwork
Corresponding change to MultiLayerNetwork
Signed-off-by: Jxtps Jxtps <jxtps435@gmail.com>
* Update contributing and issue/PR templates (#7934)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix link to AdaDelta paper (#7942)
Fix link to AdaDelta paper hosted on matthewzeiler.com
Signed-off-by: Jxtps
* Fixes, and ignores for known/logged failing issues (#7943)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* SameDiff + DL4J/SameDiff: Multiple fixes (#28)
* #7919 HDF5 attribute buffer length fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7909 Arbiter constructor exception ux improvements
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7925 RNN output layer length checks
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Add listener for validating inputs are not incorrectly modified
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Integrate NonInplaceValidationListener into tests
* #7844 DL4J SameDiff fixes for variable minibatch size
* DL4J SameDiff fixes - ensure gradient for input placeholder is available
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Tweaks to ExternalErrorsFunction - use placeholders, make more robust
* Another fix
* More fixes
* More SameDiff/DL4J fixes
* Scope out scalar array creation in BaseScalarOp
* Remove debug code
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] Final dev branch merge (#29)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* [WIP] Multiple dataset iterators (#27)
* Splitting dataset into arbitrary number
* Fixes
* Multiple split of iterator
* Test
* Test
* Some fixes
* signature change
* one more tweak
Signed-off-by: raver119 <raver119@gmail.com>
* one more test for sequential use of DataSetIteratorSplitter
Signed-off-by: raver119 <raver119@gmail.com>
* Fixes
* Fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* minor test fix
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* couple of assertions tweaked
Signed-off-by: raver119 <raver119@gmail.com>
* MDS splitter test :/
Signed-off-by: raver119 <raver119@gmail.com>
* Minor refactoring
* Multi dataset
* Some fixes
* More tests
* Small number of test fixes/improvements (failures on CI) (#31)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] More CUDA stuff (#26)
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* LRN BP CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* less memory
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed bug with crop_and_resize op helper.
* get rid of unnecessary index-calculation function
Signed-off-by: Yurii <yurii@skymind.io>
* Fixed sort with nth_element cuda-based helper.
* Refactored nth_element.
* Refactored nth_element op and tests.
* Modified usage of dim array with sortTad routine.
* Refactored main routine of helper for non_max_image_suppression op.
* non_max_image_suppression op helper with cuda kernel implementation. Initial revision.
* fix vol2col cuda kernel
* meh
Signed-off-by: raver119 <raver119@gmail.com>
* topK concept
Signed-off-by: raver119 <raver119@gmail.com>
* unsorted topK with scanWidth of 1
Signed-off-by: raver119 <raver119@gmail.com>
* correct vol2col tests
* sorted/unsorted topK
Signed-off-by: raver119 <raver119@gmail.com>
* implementation and fixing col2im/col2vol
* Corrected usage flags with input/output with reverse op.
* dup is const now
Signed-off-by: raver119 <raver119@gmail.com>
* percentile op
Signed-off-by: raver119 <raver119@gmail.com>
* group tests for maxpool2d
Signed-off-by: Yurii <yurii@skymind.io>
* special test for george
Signed-off-by: raver119 <raver119@gmail.com>
* less threads for sortTad
Signed-off-by: raver119 <raver119@gmail.com>
* provide conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* remove author in sort tad kernel code
Signed-off-by: Yurii <yurii@skymind.io>
* provide depthwise_conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* - max_pooling_with_argmax
- null check for special use
Signed-off-by: raver119 <raver119@gmail.com>
* dts cuda
Signed-off-by: raver119 <raver119@gmail.com>
* provide sconv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* std cuda
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op to conform TF implementation.
* Improved suppression helper.
* provide pooling3d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* more of minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* (bi)dynamic_rnn
Signed-off-by: raver119 <raver119@gmail.com>
* templates init order
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op.
* Added cuda kernel for non_max_suppression.
* CPU sort by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value tests
Signed-off-by: raver119 <raver119@gmail.com>
* Eliminate compiler error with cuda implementation.
* - repaired gradCheck in cuda
- provide conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* missed signature
Signed-off-by: raver119 <raver119@gmail.com>
* provide depthwise_conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* Implementation of lup helper with cuda kernel. Initial commit.
* further work on backprops for convolutions
Signed-off-by: Yurii <yurii@skymind.io>
* CUDA linear sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* CUDA tad sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* start providing of backprop for pooling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* Added atomicAdd for bool datatype.
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition scalar CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* important comment
Signed-off-by: raver119 <raver119@gmail.com>
* fix pooling2d/3d backprop helpers
Signed-off-by: Yurii <yurii@skymind.io>
* Added non-linear test with dynamic_partition.
* Improved test for dynamic_partition.
* dynamic_partition TAD concept
Signed-off-by: raver119 <raver119@gmail.com>
* - dynamic_partition TAD CUDA impl
- dynamic_partition TAD CPU fix
Signed-off-by: raver119 <raver119@gmail.com>
* - rewrite cpu code for upsampling2d/3d
- write cuda code for upsampling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* dynamic_stitch CUDA vector case
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case impl
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for dynamic_stitch 3D-4D cases.
* minor tests tweaks
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed type check for dynamic stitch.
* min/max bp
Signed-off-by: raver119 <raver119@gmail.com>
* rewrite code for upsampling2d/3d cpu
Signed-off-by: Yurii <yurii@skymind.io>
* reduce min/max/norm_max bp
Signed-off-by: raver119 <raver119@gmail.com>
* lup implementation. Additional enhancements.
* provide code for upsampling2d/3d backprop
Signed-off-by: Yurii <yurii@skymind.io>
* weightedCrossEntropyWithLogits
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed template math atomicMul for 64bit ints.
* Refactored dynamic_partition_bp op.
* inverseBroadcast fix
Signed-off-by: raver119 <raver119@gmail.com>
* DynamicPartitionBP test datatype fixed.
* - nd4j_atomicMul Windows fix
- cpu/NDArrayLambda.hpp excluded from CUDA
Signed-off-by: raver119 <raver119@gmail.com>
2019-06-27 17:37:04 +02:00
void scatterSimple ( nd4j : : LaunchContext * context , const int opId , NDArray & input , const NDArray & updates , const NDArray & indices , const std : : vector < int > & dimensions ) {
2019-06-06 14:21:15 +02:00
// updates and indices have same length
const Nd4jLong len = indices . lengthOf ( ) ;
switch ( opId ) {
case 6 : { // copy
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment ) {
auto inSubArr = input ( i , dimensions ) ;
inSubArr . p ( indices . t < Nd4jLong > ( i ) , updates . e ( i ) ) ;
}
} ;
samediff : : Threads : : parallel_for ( func , 0 , len ) ;
2019-06-06 14:21:15 +02:00
}
break ;
default :
throw std : : invalid_argument ( " helpers::scatterSimple: operation is not implemented for given id ! " ) ;
}
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void mergeMaxIndex_ ( const std : : vector < NDArray * > & inArrs , NDArray & output ) {
const Nd4jLong numArgs = inArrs . size ( ) ;
auto x = inArrs [ 0 ] ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto e = start ; e < stop ; e + = increment ) {
T max = - DataTypeUtils : : max < T > ( ) ;
Nd4jLong idx = 0 ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
for ( int i = 0 ; i < numArgs ; i + + ) {
T v = inArrs [ i ] - > e < T > ( e ) ;
if ( v > max ) {
max = v ;
idx = i ;
}
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
output . p ( e , idx ) ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
samediff : : Threads : : parallel_for ( func , 0 , x - > lengthOf ( ) ) ;
}
void mergeMaxIndex ( nd4j : : LaunchContext * context , const std : : vector < NDArray * > & inArrs , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( inArrs [ 0 ] - > dataType ( ) , mergeMaxIndex_ , ( inArrs , output ) , LIBND4J_TYPES ) ;
2019-06-06 14:21:15 +02:00
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void mergeMax_ ( const std : : vector < NDArray * > & inArrs , NDArray & output ) {
const Nd4jLong numArgs = inArrs . size ( ) ;
auto x = inArrs [ 0 ] ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto e = start ; e < stop ; e + = increment ) {
T max = - DataTypeUtils : : max < T > ( ) ;
for ( int i = 0 ; i < numArgs ; i + + ) {
T v = inArrs [ i ] - > e < T > ( e ) ;
if ( v > max )
max = v ;
}
output . p ( e , max ) ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
samediff : : Threads : : parallel_for ( func , 0 , x - > lengthOf ( ) ) ;
}
void mergeMax ( nd4j : : LaunchContext * context , const std : : vector < NDArray * > & inArrs , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( output . dataType ( ) , mergeMax_ , ( inArrs , output ) , LIBND4J_TYPES ) ;
2019-06-06 14:21:15 +02:00
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void mergeAvg_ ( const std : : vector < NDArray * > & inArrs , NDArray & output ) {
const Nd4jLong numArgs = inArrs . size ( ) ;
const T factor = 1.f / numArgs ;
auto x = inArrs [ 0 ] ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto e = start ; e < stop ; e + = increment ) {
T sum = 0. ;
for ( int i = 0 ; i < numArgs ; i + + ) {
T v = inArrs [ i ] - > e < T > ( e ) ;
sum + = v ;
}
output . p < T > ( e , sum * factor ) ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
samediff : : Threads : : parallel_for ( func , 0 , x - > lengthOf ( ) ) ;
}
void mergeAvg ( nd4j : : LaunchContext * context , const std : : vector < NDArray * > & inArrs , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( output . dataType ( ) , mergeAvg_ , ( inArrs , output ) , LIBND4J_TYPES ) ;
2019-06-06 14:21:15 +02:00
}
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void mergeAdd_ ( const std : : vector < NDArray * > & inArrs , NDArray & output ) {
const Nd4jLong numArgs = inArrs . size ( ) ;
auto x = inArrs [ 0 ] ;
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto e = start ; e < stop ; e + = increment ) {
T sum = ( T ) 0.f ;
for ( int i = 0 ; i < numArgs ; i + + )
sum + = inArrs [ i ] - > e < T > ( e ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
output . p ( e , sum ) ;
}
} ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
samediff : : Threads : : parallel_for ( func , 0 , x - > lengthOf ( ) ) ;
2019-06-06 14:21:15 +02:00
}
void mergeAdd ( nd4j : : LaunchContext * context , const std : : vector < NDArray * > & inArrs , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( output . dataType ( ) , mergeAdd_ , ( inArrs , output ) , LIBND4J_TYPES ) ;
}
//////////////////////////////////////////////////////////////////////////
// Clips `input` so that its L2 norm (taken along `dimensions`) does not exceed
// the scalar stored in `clipNorm`.
//   input      - source array; scaled in place when isInplace is true
//   output     - destination array; only written when isInplace is false
//   dimensions - dimensions passed to the Norm2 reduction and to the sub-array split
//   clipNorm   - array whose element 0 is the norm threshold
//   isInplace  - true: modify `input`; false: write the result into `output`
// When the reduction yields a single norm, the whole array is scaled by
// clipNorm / norm if norm > clipNorm. Otherwise every sub-array along
// `dimensions` is compared against, and scaled by, its own norm.
template <typename T>
static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {

    const auto norm2 = input.reduceAlongDims(reduce::Norm2, dimensions);

    const T normActual = norm2.e<T>(0);
    const T normClip   = clipNorm.e<T>(0);

    if (isInplace) {

        if (norm2.lengthOf() == 1) {

            if (normActual > normClip)
                input *= (normClip / normActual);
        }
        else {

            auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);

            auto func = PRAGMA_THREADS_FOR {
                for (auto i = start; i < stop; i += increment) {
                    const T iNormActual = norm2.e<T>(i);
                    if (iNormActual > normClip)
                        *listOfInSubArrs->at(i) *= normClip / iNormActual;
                }
            };
            samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size());

            delete listOfInSubArrs;
        }
    }
    else {

        if (norm2.lengthOf() == 1) {

            if (normActual > normClip)
                output.assign(input * (normClip / normActual));
            else
                output.assign(input);
        }
        else {

            auto listOfInSubArrs  = input.allTensorsAlongDimension(dimensions);
            auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);

            auto func = PRAGMA_THREADS_FOR {
                for (auto i = start; i < stop; i += increment) {
                    auto inputSubArr  = listOfInSubArrs->at(i);
                    auto outputSubArr = listOfOutSubArrs->at(i);
                    outputSubArr->assign(inputSubArr);

                    const T iNormActual = norm2.e<T>(i);

                    // use the cached scalar threshold, exactly like the in-place branch;
                    // this avoids re-reading clipNorm and building a temporary NDArray
                    // (clipNorm / iNormActual) for every sub-array inside the worker
                    if (iNormActual > normClip)
                        *outputSubArr *= normClip / iNormActual;
                }
            };
            samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size());

            delete listOfInSubArrs;
            delete listOfOutSubArrs;
        }
    }
}
2019-08-02 19:01:03 +02:00
//////////////////////////////////////////////////////////////////////////
void clipByNorm ( nd4j : : LaunchContext * context , NDArray & input , NDArray & output , const std : : vector < int > & dimensions , const NDArray & clipNorm , const bool isInplace ) {
BUILD_SINGLE_SELECTOR ( output . dataType ( ) , clipByNorm_ , ( input , output , dimensions , clipNorm , isInplace ) , FLOAT_TYPES ) ;
}
2019-06-06 14:21:15 +02:00
template < typename T >
static void clipByGlobalNorm_ ( std : : vector < NDArray * > const & inputs , double clipNorm , nd4j : : memory : : Workspace * workspace , std : : vector < NDArray * > & outputs , bool isInplace ) {
2019-09-09 15:27:45 +02:00
T globalNorm = 0 ; //NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
// PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(sumT : globalNorm)
2019-08-07 14:29:17 +02:00
for ( size_t i = 0 ; i < inputs . size ( ) ; i + + ) {
auto input = inputs [ i ] ;
2019-06-06 14:21:15 +02:00
auto l2norm = input - > reduceNumber ( reduce : : Norm2 ) ;
2019-09-09 15:27:45 +02:00
globalNorm + = l2norm . t < T > ( 0 ) * l2norm . t < T > ( 0 ) ;
2019-06-06 14:21:15 +02:00
}
2019-09-09 15:27:45 +02:00
//globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr);// = nd4j::math::nd4j_sqrt(globalNorm);
auto normS = nd4j : : math : : nd4j_sqrt < T , T > ( globalNorm ) ;
outputs [ inputs . size ( ) ] - > p ( 0 , normS ) ;
2019-06-06 14:21:15 +02:00
2019-09-09 15:27:45 +02:00
const T factor = clipNorm / normS ;
2019-06-06 14:21:15 +02:00
2019-09-09 15:27:45 +02:00
// PRAGMA_OMP_PARALLEL_FOR
2019-06-06 14:21:15 +02:00
for ( size_t e = 0 ; e < inputs . size ( ) ; e + + ) {
// all-reduce
auto input = inputs [ e ] ;
auto output = outputs [ e ] ;
2019-09-09 15:27:45 +02:00
if ( normS < = clipNorm ) {
2019-06-06 14:21:15 +02:00
output - > assign ( input ) ;
}
else {
auto lambda = LAMBDA_T ( _x , factor ) { return _x * factor ; } ;
input - > applyLambda < T > ( lambda , output ) ;
}
}
}
void clipByGlobalNorm ( nd4j : : LaunchContext * context , std : : vector < NDArray * > const & inputs , double clipNorm , nd4j : : memory : : Workspace * workspace , std : : vector < NDArray * > & outputs , bool isInplace ) {
BUILD_SINGLE_SELECTOR ( outputs [ 0 ] - > dataType ( ) , clipByGlobalNorm_ , ( inputs , clipNorm , workspace , outputs , isInplace ) , FLOAT_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void clipByGlobalNorm_ , ( std : : vector < NDArray * > const & inputs , double clipNorm , nd4j : : memory : : Workspace * workspace , std : : vector < NDArray * > & outputs , bool isInplace ) , FLOAT_TYPES ) ;
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void clipByNormBP_ ( const NDArray & input , const NDArray & gradO , NDArray & gradI /*output*/ , const std : : vector < int > & dimensions , const NDArray & clipNorm ) {
const int rank = input . rankOf ( ) ;
auto norm2 = input . reduceAlongDims ( reduce : : Norm2 , dimensions ) ;
if ( norm2 . lengthOf ( ) = = 1 ) {
const T N = norm2 . e < T > ( 0 ) ;
auto cn = clipNorm . e < T > ( 0 ) ;
if ( N > cn ) {
const T sumOfProd = ( input * gradO ) . reduceNumber ( reduce : : Sum ) . e < T > ( 0 ) ; // reduce to scalar
const T factor1 = static_cast < T > ( 1.f ) / N ;
2019-08-02 19:01:03 +02:00
const T factor3 = factor1 / ( N * N ) ; // 1 / (N*N*N)
2019-06-06 14:21:15 +02:00
auto lambda = LAMBDA_TT ( elem1 , elem2 , cn , sumOfProd , factor1 , factor3 ) {
return cn * ( factor1 * elem2 - factor3 * elem1 * sumOfProd ) ;
} ;
( const_cast < NDArray & > ( input ) ) . applyPairwiseLambda < T > ( const_cast < NDArray * > ( & gradO ) , lambda , & gradI ) ;
}
else
gradI . assign ( gradO ) ;
}
else {
2019-08-02 19:01:03 +02:00
const auto gradISubArrs = gradI . allTensorsAlongDimension ( { dimensions } ) ;
const auto gradOSubArrs = gradO . allTensorsAlongDimension ( { dimensions } ) ;
const auto inputSubArrs = input . allTensorsAlongDimension ( { dimensions } ) ;
2019-06-06 14:21:15 +02:00
2019-08-02 19:01:03 +02:00
auto cn = clipNorm . e < T > ( 0 ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
for ( auto i = start ; i < stop ; i + = increment ) {
T N = norm2 . e < T > ( i ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
auto gradOSubArr = gradOSubArrs - > at ( i ) ;
auto gradISubArr = gradISubArrs - > at ( i ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
if ( N > cn ) {
auto inputSubArr = inputSubArrs - > at ( i ) ;
const T sumOfProd = ( * inputSubArr * * gradOSubArr ) . reduceNumber ( reduce : : Sum ) . e < T > ( 0 ) ; // reduce to scalar
const T factor1 = static_cast < T > ( 1.f ) / N ;
const T factor3 = factor1 / ( N * N ) ; // 1 / (N*N*N)
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
auto lambda = LAMBDA_TT ( elem1 , elem2 , cn , sumOfProd , factor1 , factor3 ) {
return cn * ( factor1 * elem2 - factor3 * elem1 * sumOfProd ) ;
} ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
inputSubArr - > applyPairwiseLambda < T > ( gradOSubArr , lambda , gradISubArr ) ;
} else
gradISubArr - > assign ( gradOSubArr ) ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
samediff : : Threads : : parallel_tad ( func , 0 , gradISubArrs - > size ( ) ) ;
2019-08-02 19:01:03 +02:00
delete gradISubArrs ;
delete gradOSubArrs ;
delete inputSubArrs ;
2019-06-06 14:21:15 +02:00
}
}
void clipByNormBP ( nd4j : : LaunchContext * context , const NDArray & input , const NDArray & gradO , NDArray & gradI /*output*/ , const std : : vector < int > & dimensions , const NDArray & clipNorm ) {
BUILD_SINGLE_SELECTOR ( gradI . dataType ( ) , clipByNormBP_ , ( input , gradO , gradI , dimensions , clipNorm ) , FLOAT_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void clipByNormBP_ , ( const NDArray & input , const NDArray & gradO , NDArray & gradI /*output*/ , const std : : vector < int > & dimensions , const NDArray & clipNorm ) , FLOAT_TYPES ) ;
//////////////////////////////////////////////////////////////////////////
// Clips by the "averaged" norm n2 = l2norm / number_of_elements.
//   input      - source array
//   output     - destination array
//   dimensions - empty: treat the whole array as one unit; otherwise clip every
//                sub-array along these dimensions separately
//   clipNorm   - array whose element 0 is the threshold cn
//   isInplace  - when false, `input` is first copied into `output` wherever no
//                scaling is applied
// A unit whose averaged norm n2 exceeds cn is multiplied by cn / n2.
template <typename T>
static void clipByAveraged_(NDArray& input, NDArray& output, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool isInplace) {

    auto cn = clipNorm.e<T>(0);

    if (dimensions.size() == 0) {
        // all-reduce
        T n2 = input.reduceNumber(reduce::Norm2).e<T>(0) / input.lengthOf();
        if (n2 <= cn) {
            if (!isInplace)
                output.assign(input);
        }
        else {
            const T factor = cn / n2;
            auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
            input.applyLambda<T>(lambda, &output);
        }
    }
    else {
        // along dimension
        auto norm2 = input.reduceAlongDims(reduce::Norm2, dimensions, false);
        if (!isInplace)
            output.assign(input);

        auto tads = output.allTensorsAlongDimension(dimensions);
        const Nd4jLong numTads = tads->size();

        // TODO: make this CUDA-compliant somehow
        for (Nd4jLong e = 0; e < numTads; e++) {
            auto tad = tads->at(e);
            T n2 = norm2.e<T>(e) / tad->lengthOf();

            if (n2 > cn) {
                // factor is computed only when it is needed
                const T factor = cn / n2;
                auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
                // scale the sub-array in place; the previous target (&output) wrote
                // the scaled values into the whole output array instead of into
                // the sub-array they belong to
                tad->applyLambda<T>(lambda, tad);
            }
        }
        delete tads;
    }
}
void clipByAveraged ( nd4j : : LaunchContext * context , NDArray & input , NDArray & output , const std : : vector < int > & dimensions , const NDArray & clipNorm , const bool isInplace ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , clipByAveraged_ , ( input , output , dimensions , clipNorm , isInplace ) , FLOAT_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void clipByAveraged_ , ( NDArray & input , NDArray & output , const std : : vector < int > & dimensions , const NDArray & clipNorm , const bool isInplace ) , FLOAT_TYPES ) ;
/*
if ( d1 > params [ 1 ] )
return params [ 1 ] ;
else if ( d1 < params [ 0 ] )
return params [ 0 ] ;
else return d1 ;
*/
// Clamps every element of `input` and writes the result into `output`.
//   leftBound  - lower bound
//   rightBound - upper bound
// The upper bound is tested first, so an element above rightBound yields
// rightBound even if it is also below leftBound.
template <typename T>
static void clipByValue_(NDArray& input, double leftBound, double rightBound, NDArray& output) {

    auto clamp = LAMBDA_T(_x, leftBound, rightBound) {
        if (_x > rightBound) {
            return rightBound;
        }
        else if (_x < leftBound) {
            return leftBound;
        }
        else {
            return _x;
        }
    };

    input.applyLambda<T>(clamp, &output);
}
void clipByValue ( nd4j : : LaunchContext * context , NDArray & input , double leftBound , double rightBound , NDArray & output ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , clipByValue_ , ( input , leftBound , rightBound , output ) , FLOAT_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void clipByValue_ , ( NDArray & input , double leftBound , double rightBound , NDArray & output ) ; , FLOAT_TYPES ) ;
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void mirrorPad_ ( const NDArray & input , const NDArray & paddings , NDArray & output , const int mode ) {
// mode: 0 - REFLECT, else - SYMMETRIC
const int reflBorder = ( bool ) mode ? 1 : 0 ;
const int rank = input . rankOf ( ) ;
const Nd4jLong outLen = output . lengthOf ( ) ;
if ( rank < = 1 ) {
const Nd4jLong inLen = input . lengthOf ( ) ;
const auto leftSide = paddings . e < Nd4jLong > ( 0 ) ;
const auto leftSideCorrected = leftSide - reflBorder ;
const Nd4jLong len = 2 * ( inLen - 1 ) + leftSide + reflBorder ;
for ( int i = 0 ; i < outLen ; + + i ) {
if ( i < leftSide ) // left side
output . p ( i , input . e < T > ( leftSideCorrected - i ) ) ;
else if ( i > = leftSide & & i < leftSide + inLen ) // middle
output . p ( i , input . e < T > ( i - leftSide ) ) ;
else // right side
output . p ( i , input . e < T > ( len - i ) ) ;
}
}
else {
2019-11-13 15:15:18 +01:00
auto func = PRAGMA_THREADS_FOR {
Nd4jLong inIdx [ MAX_RANK ] ;
Nd4jLong outIdx [ MAX_RANK ] ;
for ( auto i = start ; i < stop ; i + = increment ) {
shape : : index2coords ( i , output . getShapeInfo ( ) , outIdx ) ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
for ( int j = 0 ; j < rank ; + + j ) {
const Nd4jLong inLen = input . sizeAt ( j ) ;
const auto leftSide = paddings . e < T > ( j , 0 ) ;
const auto leftSideCorrected = leftSide - reflBorder ;
const Nd4jLong len = 2 * ( inLen - 1 ) + leftSide + reflBorder ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
if ( outIdx [ j ] < leftSide ) // left side
inIdx [ j ] = leftSideCorrected - outIdx [ j ] ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
else if ( outIdx [ j ] > = leftSide & & outIdx [ j ] < leftSide + inLen ) // middle
inIdx [ j ] = outIdx [ j ] - leftSide ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
else // right side
inIdx [ j ] = len - outIdx [ j ] ;
}
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
auto outOffset = shape : : getOffset ( output . getShapeInfo ( ) , outIdx ) ;
auto inOffset = shape : : getOffset ( input . getShapeInfo ( ) , inIdx ) ;
reinterpret_cast < T * > ( output . buffer ( ) ) [ outOffset ] = reinterpret_cast < T * > ( input . getBuffer ( ) ) [ inOffset ] ;
2019-06-06 14:21:15 +02:00
}
2019-11-13 15:15:18 +01:00
} ;
2019-06-06 14:21:15 +02:00
2019-11-13 15:15:18 +01:00
samediff : : Threads : : parallel_for ( func , 0 , outLen ) ;
2019-06-06 14:21:15 +02:00
}
}
void mirrorPad ( nd4j : : LaunchContext * context , const NDArray & input , const NDArray & paddings , NDArray & output , const int mode ) {
BUILD_SINGLE_SELECTOR ( input . dataType ( ) , mirrorPad_ , ( input , paddings , output , mode ) , LIBND4J_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void mirrorPad_ , ( const NDArray & input , const NDArray & paddings , NDArray & output , const int mode ) , LIBND4J_TYPES ) ;
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void concat_ ( const std : : vector < NDArray * > & inArrs , NDArray & output , const int axis ) {
2019-07-15 15:36:35 +02:00
nd4j : : SpecialMethods < T > : : concatCpuGeneric ( inArrs , output , axis ) ;
2019-06-06 14:21:15 +02:00
}
void concat ( nd4j : : LaunchContext * context , const std : : vector < NDArray * > & inArrs , NDArray & output , const int axis ) {
BUILD_SINGLE_SELECTOR ( output . dataType ( ) , concat_ , ( inArrs , output , axis ) , LIBND4J_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void concat_ , ( const std : : vector < NDArray * > & inArrs , NDArray & output , const int axis ) , LIBND4J_TYPES ) ;
//////////////////////////////////////////////////////////////////////////
template < typename T >
static void tileBP_ ( const NDArray & gradO /*input*/ , NDArray & gradI /*output*/ , const std : : vector < Nd4jLong > reps ) {
T * gradIBuff = reinterpret_cast < T * > ( gradI . getBuffer ( ) ) ;
const T * gradOBuff = reinterpret_cast < T * > ( gradO . getBuffer ( ) ) ;
const Nd4jLong gradILen = gradI . lengthOf ( ) ;
const Nd4jLong gradOLen = gradO . lengthOf ( ) ; // gradOLen >= gradILen
const Nd4jLong gradIEWS = nd4j : : math : : nd4j_abs < Nd4jLong > ( gradI . ews ( ) ) ;
const Nd4jLong gradOEWS = gradO . ews ( ) ;
// initial zeroing of gradI content
if ( gradIEWS = = 1 )
memset ( gradIBuff , 0 , gradILen * sizeof ( T ) ) ;
else {
//PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( int i = 0 ; i < gradILen * gradIEWS ; i + = gradIEWS )
gradIBuff [ i ] = static_cast < T > ( 0.f ) ;
}
if ( gradO . ordering ( ) = = ' c ' & & gradOEWS = = 1 ) {
//PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( Nd4jLong i = 0 ; i < gradOLen ; + + i ) {
auto idx = shape : : subArrayIndex ( i , gradO . getShapeInfo ( ) , gradI . getShapeInfo ( ) ) ;
gradI . p ( idx , gradI . e < T > ( idx ) + gradOBuff [ i ] ) ;
}
}
else if ( gradO . ordering ( ) = = ' c ' & & gradOEWS > 1 ) {
//PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( Nd4jLong i = 0 ; i < gradOLen ; + + i ) {
auto idx = shape : : subArrayIndex ( i , gradO . getShapeInfo ( ) , gradI . getShapeInfo ( ) ) ;
gradI . p ( idx , gradI . e < T > ( idx ) + gradOBuff [ i * gradOEWS ] ) ;
}
}
else {
//PRAGMA_OMP_PARALLEL_FOR_SIMD
for ( Nd4jLong i = 0 ; i < gradOLen ; + + i ) {
auto fidx = shape : : subArrayIndex ( i , gradO . getShapeInfo ( ) , gradI . getShapeInfo ( ) ) ;
2019-09-11 19:12:09 +02:00
gradI . p ( fidx , gradI . e < T > ( fidx ) + gradOBuff [ shape : : getIndexOffset ( i , gradO . getShapeInfo ( ) ) ] ) ;
2019-06-06 14:21:15 +02:00
}
}
}
2019-08-21 20:10:29 +02:00
void tileBP ( nd4j : : LaunchContext * context , const NDArray & gradO /*input*/ , NDArray & gradI /*output*/ , const std : : vector < Nd4jLong > reps ) {
BUILD_SINGLE_SELECTOR ( gradI . dataType ( ) , tileBP_ , ( gradO , gradI , reps ) , FLOAT_TYPES ) ;
}
BUILD_SINGLE_TEMPLATE ( template void tileBP_ , ( const NDArray & gradO /*input*/ , NDArray & gradI /*output*/ , const std : : vector < Nd4jLong > reps ) , FLOAT_TYPES ) ;
2019-06-06 14:21:15 +02:00
}
}
}