/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef LIBND4J_HEADERS_RECURRENT_H
#define LIBND4J_HEADERS_RECURRENT_H
#include <ops/declarable/headers/common.h>
namespace nd4j {
namespace ops {
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for Simple Recurrent Unit: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
*
* Input arrays:
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
* 1: 2d tensor of weights [3K x K]
* 2: row of biases [1 x 2K]
* 3: 2d tensor of previous cell state [bS x K]
* 4: optional, 2d tensor of dropout mask [bS x K]
*
* Output arrays:
* 0: 3d tensor of cell output [bS x K x N]
* 1: 3d tensor of cell state [bS x K x N]
*/
#if NOT_EXCLUDED(OP_sru)
DECLARE_CUSTOM_OP(sru, 5, 2, false, 0, 0);
#endif
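// For reference, a sketch of the SRU recurrence from the cited paper, with (*)
// denoting the elementwise product; this is a reading aid, not the exact kernel code:
//
//     x~(t) = W  * x(t)                                  // transformed input
//     f(t)  = sigmoid(Wf * x(t) + bf)                    // forget gate
//     r(t)  = sigmoid(Wr * x(t) + br)                    // reset gate
//     c(t)  = f(t) (*) c(t-1) + (1 - f(t)) (*) x~(t)     // cell state
//     h(t)  = r(t) (*) tanh(c(t)) + (1 - r(t)) (*) x(t)  // cell output
//
// The single [3K x K] weights argument stacks W, Wf and Wr; the [1 x 2K] bias row
// stacks bf and br.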
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
*
* Input arrays:
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
* 1: 2d tensor of weights [2K x 6K]
* 2: row of biases [1 x 4K]
* 3: 2d tensor of previous cell state [bS x 2K]
* 4: optional, 2d tensor of dropout mask [bS x 2K]
*
* Output arrays:
* 0: 3d tensor of cell output [N x bS x 2K]
* 1: 3d tensor of cell state [N x bS x 2K]
*/
#if NOT_EXCLUDED(OP_sru_bi)
DECLARE_CUSTOM_OP(sru_bi, 5, 2, true, 0, 0);
#endif
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for back propagation in Simple Recurrent Unit: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
*
* Input arrays:
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
* 1: 2d tensor of weights [3K x K]
* 2: row of biases [1 x 2K]
* 3: 2d tensor of previous cell state [bS x K]
* 4: 3d tensor of cell state [bS x K x N]
* 5: 2d tensor of cell state gradients [bS x K]
* 6: 3d tensor of state output gradients [bS x K x N]
* 7: optional, 2d tensor of dropout mask [bS x K]
*
* Output arrays:
* 0: 3d tensor of input gradients [bS x K x N]
* 1: 3d tensor of weight gradients [bS x 3K x K]
* 2: 2d row of bias gradients [1 x 2K]
* 3: 2d tensor of state gradients [bS x K]
*/
#if NOT_EXCLUDED(OP_sru)
DECLARE_CUSTOM_OP(sru_bp, 8, 4, true, 0, 0);
#endif
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for back propagation in Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
*
* Input arrays:
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
* 1: 2d tensor of weights [2K x 6K]
* 2: row of biases [1 x 4K]
* 3: 2d tensor of previous cell state [bS x 2K]
* 4: 3d tensor of cell state [N x bS x 2K]
* 5: 2d tensor of cell state gradients [bS x 2K]
* 6: 3d tensor of state output gradients [N x bS x 2K]
* 7: optional, 2d tensor of dropout mask [bS x 2K]
*
* Output arrays:
* 0: 3d tensor of input gradients [N x bS x 2K]
* 1: 3d tensor of weight gradients [N x 2K x 6K]
* 2: 2d row of bias gradients [1 x 4K]
* 3: 2d tensor of state gradients [bS x 2K]
*/
#if NOT_EXCLUDED(OP_sru_bi)
DECLARE_CUSTOM_OP(sru_bi_bp, 8, 4, true, 0, 0);
#endif
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for LSTM cell with peephole connections:
* S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation
* and
* https://research.google.com/pubs/archive/43905.pdf
* Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014.
*
* Input arrays:
* 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
* 1: previous cell output [batchSize x numProj], i.e. at previous time step t-1; if projection == false, then numProj == numUnits
* 2: previous cell state [batchSize x numUnits], that is at previous time step t-1
* 3: input-to-hidden weights, [inSize x 4*numUnits]
* 4: hidden-to-hidden weights, [numProj x 4*numUnits]
* 5: diagonal weights for peephole connections [3*numUnits]
* 6: projection weights [numUnits x numProj]
* 7: biases, [4*numUnits]
*
* Input integer arguments:
* 0: if not zero, provide peephole connections
* 1: if not zero, projection is performed; if zero, then numProj == numUnits is mandatory
*
* Input float arguments:
* 0: clipping value for cell state; if non-zero, the cell state is clipped
* 1: clipping value for projected cell output; if non-zero, the projected cell output is clipped
* 2: the bias added to forget gates in order to reduce the scale of forgetting at the beginning of training
*
* Output arrays:
* 0: current cell output [batchSize x numProj], that is at current time step t
* 1: current cell state [batchSize x numUnits], that is at current time step t
*/
#if NOT_EXCLUDED(OP_lstmCell)
DECLARE_CUSTOM_OP(lstmCell, 8, 2, false, 3, 2);
#endif
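// For reference, a sketch of the peephole LSTM step (Sak et al. 2014), with (*)
// denoting the elementwise product; Wx*, Wh* stand for column slices of the packed
// input-to-hidden and hidden-to-hidden weights, and the exact gate ordering inside
// the packed arrays is an implementation detail (an assumption here):
//
//     i(t) = sigmoid(Wxi*x(t) + Whi*h(t-1) + wci (*) c(t-1) + bi)          // input gate
//     f(t) = sigmoid(Wxf*x(t) + Whf*h(t-1) + wcf (*) c(t-1) + bf + fBias)  // forget gate
//     c(t) = f(t) (*) c(t-1) + i(t) (*) tanh(Wxc*x(t) + Whc*h(t-1) + bc)   // cell state
//     o(t) = sigmoid(Wxo*x(t) + Who*h(t-1) + wco (*) c(t) + bo)            // output gate
//     h(t) = o(t) (*) tanh(c(t))                                           // cell output
//     r(t) = Wp * h(t)                // projected output, only if projection != 0
//
// With peepholes disabled the wci/wcf/wco terms drop out; the two clipping values,
// when non-zero, are applied to c(t) and to the (projected) output respectively.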
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for LSTM cell with optional peephole connections:
* S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation
* and
* https://research.google.com/pubs/archive/43905.pdf
* Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014.
* See also: https://arxiv.org/pdf/1503.04069.pdf
*
* Input arrays:
* 0: input [bS, inSize] at time t
* 1: previous cell state [bS, numUnits], time t-1
* 2: previous output [bS, numUnits], time t-1
* 3: Weights - concatenated (input-to-hidden, hidden-to-hidden weights) weights, [(inSize+numUnits), 4*numUnits]
* 4: weights - cell peephole (t-1) connections to input modulation gate, [numUnits]
* 5: weights - cell peephole (t-1) connections to forget gate, [numUnits]
* 6: weights - cell peephole (t) connections to output gate, [numUnits]
* 7: biases, shape [4*numUnits]
*
* Input integer arguments:
* 0: if not zero, provide peephole connections
*
* Input float arguments:
* 0: the bias added to forget gates in order to reduce the scale of forgetting at the beginning of training
* 1: clipping value for cell state; if non-zero, the cell state is clipped
*
* Output arrays:
* 0: i - input modulation gate activations [bS, numUnits]
* 1: c (cs) - cell state (pre tanh) [bS, numUnits]
* 2: f - forget gate activations [bS, numUnits]
* 3: o - output gate activations [bS, numUnits]
* 4: z (ci) - block input [bS, numUnits]
* 5: h (co) - cell state, post tanh [bS, numUnits]
* 6: y (h) - current cell output [bS, numUnits], time t
*/
#if NOT_EXCLUDED(OP_lstmBlockCell)
DECLARE_CUSTOM_OP(lstmBlockCell, 8, 7, false, 2, 1);
#endif
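// For reference, a sketch of how the seven outputs relate, following the TF
// LSTMBlockCell convention this op mirrors (the convention is an assumption here;
// verify against the kernel). (*) denotes the elementwise product:
//
//     [zi, zc, zf, zo] = [x(t), h(t-1)] * W + b          // one packed GEMM
//     i = sigmoid(zi + wci (*) cPrev)                    // output 0
//     f = sigmoid(zf + forgetBias + wcf (*) cPrev)       // output 2
//     z = tanh(zc)                                       // output 4, block input
//     c = f (*) cPrev + i (*) z, clipped if requested    // output 1
//     h = tanh(c)                                        // output 5
//     o = sigmoid(zo + wco (*) c)                        // output 3
//     y = o (*) h                                        // output 6, cell output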
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for LSTM layer with optional peephole connections.
* See lstmBlockCell for details. lstmBlockCell is used internally for computation.
* This method expects as input (and returns as output) sequences in one of 3 formats, depending on the data format arg:
* dataFormat = 0 -> TNS: shape [timeLength, numExamples, inOutSize] - sometimes referred to as "time major"
* dataFormat = 1 -> NST: shape [numExamples, inOutSize, timeLength]
* dataFormat = 2 -> NTS: shape [numExamples, timeLength, inOutSize] - TF "time_major=false" layout
*
*
* Input arrays:
* 0: max sequence length; long/int64 scalar
* 1: input sequence, rank 3, shape as per dataFormat (e.g. [seqLength, bS, inSize] for dataFormat = 0)
* 2: previous/initial cell state [bS, numUnits]
* 3: previous/initial output [bS, numUnits]
* 4: Weights - concatenated (input-to-hidden, hidden-to-hidden weights) weights, [(inSize+numUnits), 4*numUnits]
* 5: weights - cell peephole (t-1) connections to input modulation gate, [numUnits]
* 6: weights - cell peephole (t-1) connections to forget gate, [numUnits]
* 7: weights - cell peephole (t) connections to output gate, [numUnits]
* 8: biases, shape [4*numUnits]
*
* Input integer arguments:
* 0: if not zero, provide peephole connections
* 1: Data format - 0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]; 2=NTS=[mb,seqLen,size]
*
* Input float arguments:
* 0: the bias added to forget gates in order to reduce the scale of forgetting at the beginning of training
* 1: clipping value for cell state; if non-zero, the cell state is clipped
*
* Output arrays:
* 0: i - Input modulation gate activations, rank 3, shape as per dataFormat
* 1: c (cs) - Cell state (pre tanh), rank 3, shape as per dataFormat
* 2: f - Output - forget gate activations, rank 3, shape as per dataFormat
* 3: o - Output - output gate activations, rank 3, shape as per dataFormat
* 4: z (ci) - Output - block input, rank 3, shape as per dataFormat
* 5: h (co) - Cell state, post tanh, rank 3, shape as per dataFormat
* 6: y (h) - Current cell output, rank 3, shape as per dataFormat
*/
#if NOT_EXCLUDED(OP_lstmBlock)
DECLARE_CUSTOM_OP(lstmBlock, 9, 7, false, 2, 2);
#endif
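// As an illustration of the three layouts, here is how a (time t, example n,
// feature s) element maps to a linear offset, assuming dense row-major ("c" order)
// storage; these formulas are a reading aid, not part of the op API:
//
//     TNS [timeLength, numExamples, inOutSize]: offset = (t * numExamples + n) * inOutSize + s
//     NST [numExamples, inOutSize, timeLength]: offset = (n * inOutSize  + s) * timeLength + t
//     NTS [numExamples, timeLength, inOutSize]: offset = (n * timeLength + t) * inOutSize  + s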
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for Simple Recurrent Unit cell: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
*
* Input arrays:
* 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
* 1: previous cell state [batchSize x inSize], that is at previous time step t-1
* 2: weights [inSize x 3*inSize]
* 3: biases [1 x 2*inSize]
*
* Output arrays:
* 0: current cell output [batchSize x inSize], that is at current time step t
* 1: current cell state [batchSize x inSize], that is at current time step t
*/
#if NOT_EXCLUDED(OP_sruCell)
DECLARE_CUSTOM_OP(sruCell, 4, 2, false, 0, 0);
#endif
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for Gated Recurrent Unit (GRU) cell:
* Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio
* "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation"
*
* Input arrays:
* 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
* 1: previous cell output [batchSize x numUnits], that is at previous time step t-1
* 2: RU weights - [(nIn+nOut), 2*numUnits] - reset and update gates (input/recurrent weights)
* 3: C weights - [(nIn+nOut), numUnits] - cell gate (input/recurrent weights)
* 4: reset and update biases, [2*numUnits] - reset and update gates
* 5: cell biases, [numUnits]
*
* Output arrays:
* 0: Reset gate output [bS, numUnits]
* 1: Update gate output [bS, numUnits]
* 2: Cell gate output [bS, numUnits]
* 3: Current cell output [bS, numUnits]
*/
#if NOT_EXCLUDED(OP_gruCell)
DECLARE_CUSTOM_OP(gruCell, 6, 4, false, 0, 0);
#endif
#if NOT_EXCLUDED(OP_gruCell)
DECLARE_CUSTOM_OP(gruCell_bp, 6, 5, false, 0, 0);
#endif
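// For reference, a sketch of the GRU step from the cited paper, with [a, b]
// denoting concatenation and (*) the elementwise product; which gate takes the
// first vs second half of the packed RU weights is an assumption here:
//
//     r(t) = sigmoid([x(t), h(t-1)] * Wr + br)        // reset gate,  output 0
//     u(t) = sigmoid([x(t), h(t-1)] * Wu + bu)        // update gate, output 1
//     c(t) = tanh([x(t), r(t) (*) h(t-1)] * Wc + bc)  // cell gate,   output 2
//     h(t) = u(t) (*) h(t-1) + (1 - u(t)) (*) c(t)    // cell output, output 3
//
// where Wr and Wu are the two [.., numUnits] column halves of the packed RU weights.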
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation "LSTM time sequences" with peep hole connections:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: initial cell output [batchSize x numProj], i.e. at time step = 0; if projection == false, then numProj == numUnits
* 2: initial cell state [batchSize x numUnits], that is at time step = 0
* 3: input-to-hidden weights, [inSize x 4*numUnits]
* 4: hidden-to-hidden weights, [numProj x 4*numUnits]
* 5: diagonal weights for peephole connections [3*numUnits]
* 6: projection weights [numUnits x numProj]
* 7: biases, [4*numUnits]
*
* Input integer arguments:
* 0: if not zero, provide peephole connections
* 1: if not zero, projection is performed; if zero, then numProj == numUnits is mandatory
*
* Input float arguments:
* 0: clipping value for cell state; if non-zero, the cell state is clipped
* 1: clipping value for projected cell output; if non-zero, the projected cell output is clipped
* 2: the bias added to forget gates in order to reduce the scale of forgetting at the beginning of training
*
* Output arrays:
* 0: cell outputs [time x batchSize x numProj], that is per each time step
* 1: cell states [time x batchSize x numUnits], that is per each time step
*/
#if NOT_EXCLUDED(OP_lstm)
DECLARE_CUSTOM_OP(lstm, 8, 2, false, 3, 2);
#endif
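// Operationally this is the lstmCell step unrolled over the time dimension; a
// sketch (weights/biases are the same arguments 3..7 as above):
//
//     h(-1), c(-1) = initial cell output / state (inputs 1 and 2)
//     for t in [0, time):
//         h(t), c(t) = lstmCell step on (x(t), h(t-1), c(t-1))
//         outputs[t] = h(t);  states[t] = c(t)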
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation for Gated Recurrent Unit (GRU) time sequences:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: initial cell output [batchSize x numUnits], that is at time step = 0
* 2: input-to-hidden weights, [inSize x 3*numUnits]
* 3: hidden-to-hidden weights, [numUnits x 3*numUnits]
* 4: biases, [3*numUnits]
*
* Output arrays:
* 0: cell outputs [time x batchSize x numUnits], that is per each time step
*/
#if NOT_EXCLUDED(OP_gru)
DECLARE_CUSTOM_OP(gru, 5, 1, false, 0, 0);
#endif
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation "static RNN time sequences" with peep hole connections:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: input-to-hidden weights, [inSize x numUnits]
* 2: hidden-to-hidden weights, [numUnits x numUnits]
* 3: biases, [2*numUnits]
* 4: (optional) initial cell output [batchSize x numUnits], that is at time step = 0
* 5: (optional) vector with shape [batchSize] containing integer values within [0, time); each element sets the max time step for the corresponding input in the batch, and no calculations are performed for time >= maxTimeStep
*
* Output arrays:
* 0: cell outputs [time x batchSize x numUnits]
* 1: cell final non-zero output [batchSize x numUnits]
*/
DECLARE_CUSTOM_OP(static_rnn, 4, 2, false, 0, 0);
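// For reference, a sketch of the underlying simple RNN step; the [2*numUnits]
// bias is assumed to pack the input-side and recurrent-side halves, and the exact
// handling of masked steps is an assumption to verify against the kernel:
//
//     for t in [0, time):
//         h(t) = tanh(x(t) * Wxh + h(t-1) * Whh + b)
//
// For examples whose maxTimeStep has passed (t >= maxTimeStep), per-step outputs
// are zero and the "final non-zero output" keeps the last h computed before that.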
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation "static RNN time sequences" with peep hole connections:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize] or [batchSize x time x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: input-to-hidden weights, [inSize x numUnits]
* 2: hidden-to-hidden weights, [numUnits x numUnits]
* 3: biases, [2*numUnits]
* 4: (optional) initial cell output [batchSize x numUnits], that is at time step = 0
* 5: (optional) vector with shape [batchSize] containing integer values within [0, time); each element sets the max time step for the corresponding input in the batch, and no calculations are performed for time >= maxTimeStep
*
* Input integer arguments:
* 0: (optional) timeMajor - if non-zero, then input shape is [time, batchSize, ...], else [batchSize, time, ...]
*
* Output arrays:
* 0: cell outputs [time x batchSize x numUnits] or [batchSize x time x numUnits]
* 1: cell final non-zero output [batchSize x numUnits]
*/
DECLARE_CUSTOM_OP(dynamic_rnn, 4, 2, false, 0, 0);
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation "static RNN time sequences" with peep hole connections:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: input-to-hidden weights for forward RNN, [inSize x numUnitsFW]
* 2: hidden-to-hidden weights for forward RNN, [numUnitsFW x numUnitsFW]
* 3: biases for forward RNN, [2*numUnitsFW]
* 4: input-to-hidden weights for backward RNN, [inSize x numUnitsBW]
* 5: hidden-to-hidden weights for backward RNN, [numUnitsBW x numUnitsBW]
* 6: biases for backward RNN, [2*numUnitsBW]
* 7: (optional) initial cell output for forward RNN [batchSize x numUnitsFW], that is at time step = 0
* 8: (optional) initial cell output for backward RNN [batchSize x numUnitsBW], that is at time step = 0
* 9: (optional) vector with shape [batchSize] containing integer values within [0, time); each element sets the max time step for the corresponding input in the batch, and no calculations are performed for time >= maxTimeStep
*
* Output arrays:
* 0: cell outputs [time x batchSize x (numUnitsFW + numUnitsBW)]
* 1: cell final non-zero output for forward RNN [batchSize x numUnitsFW]
* 2: cell final non-zero output for backward RNN [batchSize x numUnitsBW]
*/
DECLARE_CUSTOM_OP(static_bidirectional_rnn, 7, 3, false, 0, 0);
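// For reference: the backward RNN processes the time-reversed input (respecting
// per-example sequence lengths when the maxTimeStep vector is given), and the
// per-step outputs are joined along the feature dimension, which is where the
// [time x batchSize x (numUnitsFW + numUnitsBW)] output shape comes from:
//
//     outputs[t] = concat(hFW(t), hBW(t))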
//////////////////////////////////////////////////////////////////////////
/**
* Implementation of operation "static RNN time sequences" with peep hole connections:
*
* Input arrays:
* 0: input with shape [time x batchSize x inSize] or [batchSize x time x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
* 1: input-to-hidden weights for forward RNN, [inSize x numUnitsFW]
* 2: hidden-to-hidden weights for forward RNN, [numUnitsFW x numUnitsFW]
* 3: biases for forward RNN, [2*numUnitsFW]
* 4: input-to-hidden weights for backward RNN, [inSize x numUnitsBW]
* 5: hidden-to-hidden weights for backward RNN, [numUnitsBW x numUnitsBW]
* 6: biases for backward RNN, [2*numUnitsBW]
* 7: (optional) initial cell output for forward RNN [batchSize x numUnitsFW], that is at time step = 0
* 8: (optional) initial cell output for backward RNN [batchSize x numUnitsBW], that is at time step = 0
* 9: (optional) vector with shape [batchSize] containing integer values within [0, time); each element sets the max time step for the corresponding input in the batch, and no calculations are performed for time >= maxTimeStep
*
* Input integer arguments:
* 0: (optional) timeMajor - if non-zero, then input shape is [time, batchSize, ...], else [batchSize, time, ...]
*
* Output arrays:
* 0: cell outputs for forward RNN [time x batchSize x numUnitsFW] or [batchSize x time x numUnitsFW]
* 1: cell outputs for backward RNN [time x batchSize x numUnitsBW] or [batchSize x time x numUnitsBW]
* 2: cell final non-zero output for forward RNN [batchSize x numUnitsFW]
* 3: cell final non-zero output for backward RNN [batchSize x numUnitsBW]
*/
DECLARE_CUSTOM_OP(dynamic_bidirectional_rnn, 7, 4, false, 0, 0);
}
}
#endif