/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
//  @author raver119@gmail.com
//

#ifndef LIBND4J_HEADERS_RECURRENT_H
#define LIBND4J_HEADERS_RECURRENT_H

#include <ops/declarable/headers/common.h>

namespace nd4j {
namespace ops  { 

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for Simple Recurrent Unit: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
       *
       * Input arrays:
       *    0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
       *    1: 2d tensor of weights [3K x K]
       *    2: row of biases with twice length [1 x 2K]
       *    3: 2d tensor of previous cell state [bS x K]
       *    4: optional, 2d tensor of dropout mask [bS x K]
       *
       * Output arrays:
       *    0: 3d tensor of cell output [bS x K x N]
       *    1: 3d tensor of cell state [bS x K x N]
       */
        #if NOT_EXCLUDED(OP_sru)
        DECLARE_CUSTOM_OP(sru,   5, 2, false, 0, 0);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
       * 
       * Input arrays: 
       *    0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
       *    1: 2d tensor of weights [2K x 6K]
       *    2: row of biases with twice length [1 x 4K]
       *    3: 2d tensor of previous cell state [bS x 2K]
       *    4: optional, 2d tensor of dropout mask [bS x 2K]
       *  
       * Output arrays: 
       *    0: 3d tensor of cell output [N x bS x 2K]
       *    1: 3d tensor of cell state [N x bS x 2K]
       */
        #if NOT_EXCLUDED(OP_sru_bi)
        DECLARE_CUSTOM_OP(sru_bi,      5, 2, true,  0, 0);
        #endif


    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for back propagation in Simple Recurrent Unit: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
       * 
       * Input arrays: 
       *    0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
       *    1: 2d tensor of weights [3K x K]
       *    2: row of biases with twice length [1 x 2K]
       *    3: 2d tensor of previous cell state [bS x K]
       *    4: 3d tensor of cell state [bS x K x N]
       *    5: 2d tensor of cell state gradients [bS x K]
       *    6: 3d tensor of state output gradients [bS x K x N]
       *    7: optional, 2d tensor of dropout mask [bS x K]
       *  
       * Output arrays: 
       *    0: 3d tensor of input gradients [bS x K x N]
       *    1: 3d tensor of weights gradients [bS x 3K x K]
       *    2: 2d, row of biases gradients [1 x 2K]
       *    3: 2d, tensor of state gradients [bS x K]
       */
        #if NOT_EXCLUDED(OP_sru)
        DECLARE_CUSTOM_OP(sru_bp,      8, 4, true,  0, 0);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for back propagation in Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
       * 
       * Input arrays: 
       *    0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
       *    1: 2d tensor of weights [2K x 6K]
       *    2: row of biases with twice length [1 x 4K]
       *    3: 2d tensor of previous cell state [bS x 2K]
       *    4: 3d tensor of cell state [N x bS x 2K]
       *    5: 2d tensor of cell state gradients [bS x 2K]
       *    6: 3d tensor of state output gradients [N x bS x 2K]
       *    7: optional, 2d tensor of dropout mask [bS x 2K]
       *  
       * Output arrays: 
       *    0: 3d tensor of input gradients [N x bS x 2K]
       *    1: 3d tensor of weights gradients [N x 2K x 6K]
       *    2: 2d, row of biases gradients [1 x 4K]
       *    3: 2d, tensor of state gradients [bS x 2K]
       */                  
        #if NOT_EXCLUDED(OP_sru_bi)
        DECLARE_CUSTOM_OP(sru_bi_bp,   8, 4, true,  0, 0);
        #endif


    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for LSTM cell with peep hole connections:
       *    S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation
       *    and 
       *    https://research.google.com/pubs/archive/43905.pdf
       *    Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014. 
       *
       * Input arrays: 
       *    0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
       *    1: previous cell output [batchSize x numProj],  that is at previous time step t-1, in case of projection=false -> numProj=numUnits!!! 
       *    2: previous cell state  [batchSize x numUnits], that is at previous time step t-1   
       *    3: input-to-hidden  weights, [inSize  x 4*numUnits] 
       *    4: hidden-to-hidden weights, [numProj x 4*numUnits] 
       *    5: diagonal weights for peephole connections [3*numUnits] 
       *    6: projection weights [numUnits x numProj] 
       *    7: biases, [4*numUnits] 
       * 
       *  Input integer arguments:
       *    0: if not zero, provide peephole connections
       *    1: if not zero, then projection is performed, if zero then numProj==numUnits is mandatory!
       *
       *  Input float arguments:
       *    0: clipping value for cell state, if it is not equal to zero, then cell state is clipped
       *    1: clipping value for projected cell output, if it is not equal to zero, then projected cell output is clipped
       *    2: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training
       *  
       * Output arrays: 
       *    0: current cell output [batchSize x numProj], that is at current time step t
       *    1: current cell state  [batchSize x numUnits], that is at current time step t
       */                  
        #if NOT_EXCLUDED(OP_lstmCell)
        DECLARE_CUSTOM_OP(lstmCell, 8, 2, false, 3, 2);
        #endif


    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for LSTM cell with optional peep hole connections:
       *    S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation
       *    and 
       *    https://research.google.com/pubs/archive/43905.pdf
       *    Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014.
	   * See also: https://arxiv.org/pdf/1503.04069.pdf
       *
       * Input arrays: 
       *    0: input [bS, inSize] at time t
       *    1: previous cell state  [bS, numUnits], time t-1
       *    2: previous output [bS, numUnits], time t-1
       *    3: Weights - concatenated (input-to-hidden, hidden-to-hidden weights)  weights, [(inSize+numUnits), 4*numUnits]
       *    4: weights - cell peephole (t-1) connections to input modulation gate, [numUnits]
       *    5: weights - cell peephole (t-1) connections to forget gate, [numUnits]
       *    6: weights - cell peephole (t) connections to output gate, [numUnits]
       *    7: biases, shape [4*numUnits]
       * 
       *  Input integer arguments:
       *    0: if not zero, provide peephole connections
       *
       *  Input float arguments:
       *    0: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training
	   *    1: clipping value for cell state, if it is not equal to zero, then cell state is clipped
       *  
       * Output arrays: 
       *    0: i      - Input modulation gate activations [bS, numUnits]
       *    1: c (cs) - Cell state (pre tanh) [bs, numUnits] (cs)
       *    2: f      - Output - forget gate activations [bs, numUnits]
       *    3: o      - Output - output gate activations [bs, numUnits]
       *    4: z (ci) - Output - block input [bs, numUnits]
       *    5: h (co) - Cell state, post tanh [bs, numUnits]
       *    6: y (h)  - Current cell output [bS, numUnits], time t
       */                  
        #if NOT_EXCLUDED(OP_lstmBlockCell)
        DECLARE_CUSTOM_OP(lstmBlockCell, 8, 7, false, 2, 1);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation for LSTM layer with optional peep hole connections.
       * See lstmBlockCell for details. lstmBlockCell is used internally for computation.
       * This method expects as input (and returns as output) sequences in one of 3 formats, depending on the data format arg:
       * dataFormat = 0 -> TNS: shape [timeLength, numExamples, inOutSize] - sometimes referred to as "time major"
       * dataFormat = 1 -> NST: shape [numExamples, inOutSize, timeLength]
       * dataFormat = 2 -> NTS: shape [numExamples, timeLength, inOutSize] - TF "time_major=false" layout
       *
       *
       * Input arrays:
       *    0: max sequence length; long/int64 scalar
       *    1: input [seqLength, bS, inSize] at time t
       *    2: previous/initial cell state  [bS, numUnits]
       *    3: previous/initial output [bS, numUnits]
       *    4: Weights - concatenated (input-to-hidden, hidden-to-hidden weights)  weights, [(inSize+numUnits), 4*numUnits]
       *    5: weights - cell peephole (t-1) connections to input modulation gate, [numUnits]
       *    6: weights - cell peephole (t-1) connections to forget gate, [numUnits]
       *    7: weights - cell peephole (t) connections to output gate, [numUnits]
       *    8: biases, Shape [4*numUnits]
       *
       *  Input integer arguments:
       *    0: if not zero, provide peephole connections
       *    1: Data format - 0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]; 2=NTS=[mb,seqLen,size]
       *
       *  Input float arguments:
       *    0: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training
       *    1: clipping value for cell state, if it is not equal to zero, then cell state is clipped
       *
       * Output arrays:
       *    0: i      - Input modulation gate activations, rank 3, shape as per dataFormat
       *    1: c (cs) - Cell state (pre tanh), rank 3, shape as per dataFormat
       *    2: f      - Output - forget gate activations, rank 3, shape as per dataFormat
       *    3: o      - Output - output gate activations, rank 3, shape as per dataFormat
       *    4: z (ci) - Output - block input, rank 3, shape as per dataFormat
       *    5: h (co) - Cell state, post tanh, rank 3, shape as per dataFormat
       *    6: y (h)  - Current cell output, rank 3, shape as per dataFormat
       */
        #if NOT_EXCLUDED(OP_lstmBlock)
        DECLARE_CUSTOM_OP(lstmBlock, 9, 7, false, 2, 2);
        #endif
		
    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operations for Simple Recurrent Unit cell: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi
       *
       * Input arrays: 
       *    0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
       *    1: previous cell state [batchSize x inSize], that is at previous time step t-1
       *    2: weights [inSize x 3*inSize]
       *    3: biases [1 x 2*inSize]
       * 
       * Output arrays: 
       *    0: current cell output [batchSize x inSize], that is at current time step t
       *    1: current cell state  [batchSize x inSize], that is at current time step t
       */                  
        #if NOT_EXCLUDED(OP_sruCell)
        DECLARE_CUSTOM_OP(sruCell, 4, 2, false, 0, 0);
        #endif


    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of gated Recurrent Unit cell:
       *    Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio       
       *    "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation"
       *
       * Input arrays: 
       *    0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
       *    1: previous cell output [batchSize x numUnits],  that is at previous time step t-1
       *    2: RU weights - [(nIn+nOut), 2*numUnits] - reset and update gates (input/recurrent weights)
       *    3: C weights - [(nIn+nOut), numUnits] - cell gate (input/recurrent weights)
       *    4: reset and update biases, [2*numUnits] - reset and update gates
       *    5: cell biases, [numUnits]
       *  
       * Output arrays: 
       *    0: Reset gate output [bS, numUnits]
       *    1: Update gate output [bS, numUnits]
       *    2: Cell gate output [bS, numUnits]
       *    3: Current cell output [bS, numUnits]
       */                  
        #if NOT_EXCLUDED(OP_gruCell)
        DECLARE_CUSTOM_OP(gruCell, 6, 4, false, 0, 0);
        #endif

        #if NOT_EXCLUDED(OP_gruCell)
        DECLARE_CUSTOM_OP(gruCell_bp, 6, 5, false, 0, 0);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation "LSTM time sequences" with peep hole connections:
       *
       * Input arrays: 
       *    0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: initial cell output [batchSize x numProj],  that is at time step = 0, in case of projection=false -> numProj=numUnits!!! 
       *    2: initial cell state  [batchSize x numUnits], that is at time step = 0   
       *    3: input-to-hidden  weights, [inSize  x 4*numUnits] 
       *    4: hidden-to-hidden weights, [numProj x 4*numUnits] 
       *    5: diagonal weights for peephole connections [3*numUnits] 
       *    6: projection weights [numUnits x numProj] 
       *    7: biases, [4*numUnits] 
       * 
       *  Input integer arguments:
       *    0: if not zero, provide peephole connections
       *    1: if not zero, then projection is performed, if zero then numProj==numUnits is mandatory!
       *
       *  Input float arguments:
       *    0: clipping value for cell state, if it is not equal to zero, then cell state is clipped
       *    1: clipping value for projected cell output, if it is not equal to zero, then projected cell output is clipped
       *    2: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training
       *  
       * Output arrays: 
       *    0: cell outputs [time x batchSize x numProj], that is per each time step
       *    1: cell states  [time x batchSize x numUnits], that is per each time step
       */                  
        #if NOT_EXCLUDED(OP_lstm)
        DECLARE_CUSTOM_OP(lstm, 8, 2, false, 3, 2);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of gated Recurrent Unit:
       *
       * Input arrays: 
       *    0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: initial cell output [batchSize x numUnits],  that is at time step = 0
       *    2: input-to-hidden  weights, [inSize   x 3*numUnits] 
       *    3: hidden-to-hidden weights, [numUnits x 3*numUnits] 
       *    4: biases, [3*numUnits]        
       *  
       * Output arrays: 
       *    0: cell outputs [time x batchSize x numUnits], that is per each time step    
       */                  
        #if NOT_EXCLUDED(OP_gru)
        DECLARE_CUSTOM_OP(gru, 5, 1, false, 0, 0);
        #endif

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation "static RNN time sequences" with peep hole connections:
       *
       * Input arrays:
       *    0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: input-to-hidden  weights, [inSize   x numUnits]
       *    2: hidden-to-hidden weights, [numUnits x numUnits]
       *    3: biases, [2*numUnits]
       *    4: (optional) initial cell output [batchSize x numUnits], that is at time step = 0
       *    5: (optional) vector with shape [batchSize] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this provides no calculations for time >= maxTimeStep
       *
       * Output arrays:
       *    0: cell outputs [time x batchSize x numUnits]
       *    1: cell final non-zero output [batchSize x numUnits]
       */
        DECLARE_CUSTOM_OP(static_rnn, 4, 2, false, 0, 0);

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation "static RNN time sequences" with peep hole connections:
       *
       * Input arrays:
       *    0: input with shape [time x batchSize x inSize] or [batchSize x time x numUnits], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: input-to-hidden  weights, [inSize   x numUnits]
       *    2: hidden-to-hidden weights, [numUnits x numUnits]
       *    3: biases, [2*numUnits]
       *    4: (optional) initial cell output [batchSize x numUnits], that is at time step = 0
       *    5: (optional) vector with shape [batchSize] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this provides no calculations for time >= maxTimeStep
       *
       *  Input integer arguments:
       *    0: (optional) timeMajor - if non zero then input shape is [time, batchSize, ...], else [batchSize, time, ...]
       *
       * Output arrays:
       *    0: cell outputs [time x batchSize x numUnits] or [batchSize x time x numUnits]
       *    1: cell final non-zero output [batchSize x numUnits]
       */
        DECLARE_CUSTOM_OP(dynamic_rnn, 4, 2, false, 0, 0);

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation "static RNN time sequences" with peep hole connections:
       *
       * Input arrays:
       *    0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: input-to-hidden  weights for forward RNN, [inSize   x numUnitsFW]
       *    2: hidden-to-hidden weights for forward RNN, [numUnitsFW x numUnitsFW]
       *    3: biases for forward RNN, [2*numUnitsFW]
       *    4: input-to-hidden  weights for backward RNN, [inSize   x numUnitsBW]
       *    5: hidden-to-hidden weights for backward RNN, [numUnitsBW x numUnitsBW]
       *    6: biases for backward RNN, [2*numUnitsBW]
       *    7: (optional) initial cell output for forward RNN [batchSize x numUnitsFW], that is at time step = 0
       *    8: (optional) initial cell output for backward RNN [batchSize x numUnitsBW], that is at time step = 0
       *    9: (optional) vector with shape [batchSize] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this provides no calculations for time >= maxTimeStep
       *
       * Output arrays:
       *    0: cell outputs [time x batchSize x (numUnitsFW + numUnitsBW)]
       *    1: cell final non-zero output for forward RNN  [batchSize x numUnitsFW]
       *    2: cell final non-zero output for backward RNN [batchSize x numUnitsBW]
       */
        DECLARE_CUSTOM_OP(static_bidirectional_rnn, 7, 3, false, 0, 0);

    //////////////////////////////////////////////////////////////////////////
    /**
       * Implementation of operation "static RNN time sequences" with peep hole connections:
       *
       * Input arrays:
       *    0: input with shape [time x batchSize x inSize] or [batchSize x time x inSize], time - number of time steps, batchSize - batch size, inSize - number of features
       *    1: input-to-hidden  weights for forward RNN, [inSize   x numUnitsFW]
       *    2: hidden-to-hidden weights for forward RNN, [numUnitsFW x numUnitsFW]
       *    3: biases for forward RNN, [2*numUnitsFW]
       *    4: input-to-hidden  weights for backward RNN, [inSize   x numUnitsBW]
       *    5: hidden-to-hidden weights for backward RNN, [numUnitsBW x numUnitsBW]
       *    6: biases for backward RNN, [2*numUnitsBW]
       *    7: (optional) initial cell output for forward RNN [batchSize x numUnitsFW], that is at time step = 0
       *    8: (optional) initial cell output for backward RNN [batchSize x numUnitsBW], that is at time step = 0
       *    9: (optional) vector with shape [batchSize] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this provides no calculations for time >= maxTimeStep
       *
       *  Input integer arguments:
       *    0: (optional) timeMajor - if non zero then input shape is [time, batchSize, ...], else [batchSize, time, ...]
       *
       * Output arrays:
       *    0: cell outputs for forward  RNN [time x batchSize x numUnitsFW] or [batchSize x time x  numUnitsFW]
       *    1: cell outputs for backward RNN [time x batchSize x numUnitsBW] or [batchSize x time x  numUnitsBW]
       *    2: cell final non-zero output for forward  RNN [batchSize x numUnitsFW]
       *    3: cell final non-zero output for backward RNN [batchSize x numUnitsBW]
       */
        DECLARE_CUSTOM_OP(dynamic_bidirectional_rnn, 7, 4, false, 0, 0);

}
}
#endif