cavis/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/params/GravesLSTMParamInitializer.java

/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  *  See the NOTICE file distributed with this work for additional
 *  *  information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.deeplearning4j.nn.params;

import java.util.*;
import lombok.val;
import org.deeplearning4j.nn.api.AbstractParamInitializer;
import org.deeplearning4j.nn.conf.layers.LayerConfiguration;
import org.deeplearning4j.nn.weights.IWeightInit;
import org.deeplearning4j.nn.weights.WeightInitUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.INDArrayIndex;
import org.nd4j.linalg.indexing.NDArrayIndex;

/**
 * Parameter initializer for {@code GravesLSTM} layer configurations.
 *
 * <p>The flattened parameter (and gradient) vector is treated as three consecutive column ranges,
 * with {@code nIn = getNIn()} and {@code nOut = getNOut()} of the layer configuration:
 * <ol>
 *   <li>input weights: {@code nIn * (4 * nOut)} values, exposed with shape {@code [nIn, 4 * nOut]}</li>
 *   <li>recurrent weights: {@code nOut * (4 * nOut + 3)} values, exposed with shape
 *       {@code [nOut, 4 * nOut + 3]}</li>
 *   <li>biases: {@code 4 * nOut} values</li>
 * </ol>
 * NOTE(review): the 3 extra recurrent columns are presumably the peephole connections of the
 * Graves LSTM formulation - confirm against the layer implementation.
 */
public class GravesLSTMParamInitializer extends AbstractParamInitializer {

    private static final GravesLSTMParamInitializer INSTANCE = new GravesLSTMParamInitializer();

    /**
     * @return the shared, statically created initializer instance
     */
    public static GravesLSTMParamInitializer getInstance() {
        return INSTANCE;
    }

    /** Weights for previous time step -> current time step connections */
    public final static String RECURRENT_WEIGHT_KEY = LSTMParamInitializer.RECURRENT_WEIGHT_KEY;
    /** Key of the bias parameters; same value as {@link LSTMParamInitializer#BIAS_KEY}. */
    public final static String BIAS_KEY = LSTMParamInitializer.BIAS_KEY;
    /** Key of the input weights; same value as {@link LSTMParamInitializer#INPUT_WEIGHT_KEY}. */
    public final static String INPUT_WEIGHT_KEY = LSTMParamInitializer.INPUT_WEIGHT_KEY;

    /**
     * Total number of parameters of the layer: input weights + recurrent weights + biases.
     *
     * @param l layer configuration; must be a {@code GravesLSTM} configuration (it is cast)
     * @return {@code nIn * (4 * nOut) + nOut * (4 * nOut + 3) + 4 * nOut}
     */
    @Override
    public long numParams(LayerConfiguration l) {
        final org.deeplearning4j.nn.conf.layers.GravesLSTM lstmConf = asGravesLstm(l);
        final long layerSize = lstmConf.getNOut(); // neurons in this layer
        final long prevLayerSize = lstmConf.getNIn(); // neurons in the previous layer

        final long inputWeightCount = prevLayerSize * (4 * layerSize);
        final long recurrentWeightCount = layerSize * (4 * layerSize + 3);
        final long biasCount = 4 * layerSize;

        return inputWeightCount + recurrentWeightCount + biasCount;
    }

    /**
     * @return all parameter keys, in the order input weights, recurrent weights, bias
     */
    @Override
    public List<String> paramKeys(LayerConfiguration layer) {
        return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY, BIAS_KEY);
    }

    /**
     * @return the weight parameter keys: input weights, then recurrent weights
     */
    @Override
    public List<String> weightKeys(LayerConfiguration layer) {
        return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY);
    }

    /**
     * @return a single-element list holding the bias key
     */
    @Override
    public List<String> biasKeys(LayerConfiguration layer) {
        return Collections.singletonList(BIAS_KEY);
    }

    /**
     * @return {@code true} if {@code key} is the recurrent or the input weight key
     */
    @Override
    public boolean isWeightParam(LayerConfiguration layer, String key) {
        if (RECURRENT_WEIGHT_KEY.equals(key)) {
            return true;
        }
        return INPUT_WEIGHT_KEY.equals(key);
    }

    /**
     * @return {@code true} if {@code key} is the bias key
     */
    @Override
    public boolean isBiasParam(LayerConfiguration layer, String key) {
        return BIAS_KEY.equals(key);
    }

    /**
     * Splits the flattened parameter array into input weights, recurrent weights and biases and,
     * if requested, initializes them.
     *
     * <p>The three parameter keys are registered on {@code conf} via {@code addVariable} before
     * the length of {@code paramsView} is validated.
     *
     * @param conf             layer configuration; must be a {@code GravesLSTM} configuration
     * @param paramsView       flattened parameters; its length must equal {@link #numParams}
     * @param initializeParams if {@code true}, weights are initialized through the configured
     *                         {@link IWeightInit} instances and the forget gate biases are set to
     *                         the configured forget gate bias init value; otherwise the existing
     *                         values are only reshaped
     * @return synchronized, insertion-ordered map: input weights, recurrent weights, bias
     * @throws IllegalStateException if {@code paramsView} does not have the expected length
     */
    @Override
    public Map<String, INDArray> init(LayerConfiguration conf, INDArray paramsView, boolean initializeParams) {
        final Map<String, INDArray> result = Collections.synchronizedMap(new LinkedHashMap<String, INDArray>());
        final org.deeplearning4j.nn.conf.layers.GravesLSTM lstmConf = asGravesLstm(conf);
        final double forgetBias = lstmConf.getForgetGateBiasInit();

        final long layerSize = lstmConf.getNOut(); // neurons in this layer
        final long prevLayerSize = lstmConf.getNIn(); // neurons in the previous layer

        for (String key : new String[] {INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY, BIAS_KEY}) {
            conf.addVariable(key);
        }

        final long expected = numParams(conf);
        if (paramsView.length() != expected) {
            throw new IllegalStateException(
                            "Expected params view of length " + expected + ", got length " + paramsView.length());
        }

        // End offsets (exclusive) of the three consecutive segments of the flattened array.
        final long inputEnd = prevLayerSize * (4 * layerSize);
        final long recurrentEnd = inputEnd + layerSize * (4 * layerSize + 3);
        final long biasEnd = recurrentEnd + 4 * layerSize;

        final INDArray inputSegment = columns(paramsView, 0, inputEnd);
        final INDArray recurrentSegment = columns(paramsView, inputEnd, recurrentEnd);
        final INDArray biasSegment = columns(paramsView, recurrentEnd, biasEnd);

        final long[] inputShape = {prevLayerSize, 4 * layerSize};
        final long[] recurrentShape = {layerSize, 4 * layerSize + 3};

        if (!initializeParams) {
            // Existing values are kept; the weight segments are only reshaped.
            result.put(INPUT_WEIGHT_KEY, WeightInitUtil.reshapeWeights(inputShape, inputSegment));
            result.put(RECURRENT_WEIGHT_KEY, WeightInitUtil.reshapeWeights(recurrentShape, recurrentSegment));
        } else {
            final long fanIn = layerSize;
            final long fanOut = prevLayerSize + layerSize;

            // A dedicated recurrent weight init takes precedence over the general one.
            final IWeightInit recurrentOverride = lstmConf.getWeightInitRecurrent();
            final IWeightInit recurrentInit =
                            recurrentOverride != null ? recurrentOverride : lstmConf.getWeightInit();

            // Input weights are initialized before the recurrent weights.
            result.put(INPUT_WEIGHT_KEY, lstmConf.getWeightInit().init(fanIn, fanOut, inputShape,
                            IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, inputSegment));
            result.put(RECURRENT_WEIGHT_KEY, recurrentInit.init(fanIn, fanOut, recurrentShape,
                            IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, recurrentSegment));

            /* Initialize the forget gate biases to the specified value.
             * Gate order within the bias segment: input, forget, output, input modulation, i.e., IFOG,
             * hence the forget gate occupies columns [nOut, 2 * nOut).
             * See Sutskever PhD thesis, pg19:
             * "it is important for [the forget gate activations] to be approximately 1 at the early stages of learning,
             *  which is accomplished by initializing [the forget gate biases] to a large value (such as 5). If it is
             *  not done, it will be harder to learn long range dependencies because the smaller values of the forget
             *  gates will create a vanishing gradients problem."
             *  http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf
             */
            biasSegment.put(
                            new INDArrayIndex[] {NDArrayIndex.interval(0,0,true),
                                            NDArrayIndex.interval(layerSize, 2 * layerSize)},
                            Nd4j.valueArrayOf(new long[]{1, layerSize}, forgetBias));
        }

        result.put(BIAS_KEY, biasSegment);
        return result;
    }

    /**
     * Splits the flattened gradient array into per-parameter gradient arrays using the same
     * segment layout as {@link #init}.
     *
     * @param conf         layer configuration; must be a {@code GravesLSTM} configuration
     * @param gradientView flattened gradients; its length must equal {@link #numParams}
     * @return insertion-ordered map: input weight gradients (reshaped in 'f' order to
     *         {@code [nIn, 4 * nOut]}), recurrent weight gradients (reshaped in 'f' order to
     *         {@code [nOut, 4 * nOut + 3]}), bias gradients (not reshaped)
     * @throws IllegalStateException if {@code gradientView} does not have the expected length
     */
    @Override
    public Map<String, INDArray> getGradientsFromFlattened(LayerConfiguration conf, INDArray gradientView) {
        final org.deeplearning4j.nn.conf.layers.GravesLSTM lstmConf = asGravesLstm(conf);

        final long layerSize = lstmConf.getNOut(); // neurons in this layer
        final long prevLayerSize = lstmConf.getNIn(); // neurons in the previous layer

        final long expected = numParams(conf);
        if (gradientView.length() != expected) {
            throw new IllegalStateException(
                            "Expected gradient view of length " + expected + ", got length " + gradientView.length());
        }

        // End offsets (exclusive) of the three consecutive segments of the flattened array.
        final long inputEnd = prevLayerSize * (4 * layerSize);
        final long recurrentEnd = inputEnd + layerSize * (4 * layerSize + 3);
        final long biasEnd = recurrentEnd + 4 * layerSize;

        final INDArray inputGrad = columns(gradientView, 0, inputEnd)
                        .reshape('f', prevLayerSize, 4 * layerSize);
        final INDArray recurrentGrad = columns(gradientView, inputEnd, recurrentEnd)
                        .reshape('f', layerSize, 4 * layerSize + 3);
        final INDArray biasGrad = columns(gradientView, recurrentEnd, biasEnd); // already a row vector

        final Map<String, INDArray> gradients = new LinkedHashMap<>();
        gradients.put(INPUT_WEIGHT_KEY, inputGrad);
        gradients.put(RECURRENT_WEIGHT_KEY, recurrentGrad);
        gradients.put(BIAS_KEY, biasGrad);
        return gradients;
    }

    /** Casts the generic layer configuration to the GravesLSTM configuration this class handles. */
    private static org.deeplearning4j.nn.conf.layers.GravesLSTM asGravesLstm(LayerConfiguration conf) {
        return (org.deeplearning4j.nn.conf.layers.GravesLSTM) conf;
    }

    /** Selects columns {@code [from, to)} of row 0 of the given flattened array. */
    private static INDArray columns(INDArray flat, long from, long to) {
        return flat.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(from, to));
    }
}
Dev commits 2021-02-01 14:31:20 +09:00			`/*`
			`* ******************************************************************************`
			`* *`
			`* *`
			`* * This program and the accompanying materials are made available under the`
			`* * terms of the Apache License, Version 2.0 which is available at`
			`* * https://www.apache.org/licenses/LICENSE-2.0.`
			`* *`
Update LICENSE 2021-02-01 17:47:29 +09:00			`* * See the NOTICE file distributed with this work for additional`
			`* * information regarding copyright ownership.`
Dev commits 2021-02-01 14:31:20 +09:00			`* * Unless required by applicable law or agreed to in writing, software`
			`* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* * License for the specific language governing permissions and limitations`
			`* * under the License.`
			`* *`
			`* * SPDX-License-Identifier: Apache-2.0`
			`* *****************************************************************************`
			`*/`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00
			`package org.deeplearning4j.nn.params;`

Fixing Tests 2023-05-08 09:34:44 +02:00			`import java.util.*;`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`import lombok.val;`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`import org.deeplearning4j.nn.api.AbstractParamInitializer;`
			`import org.deeplearning4j.nn.conf.layers.LayerConfiguration;`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`import org.deeplearning4j.nn.weights.IWeightInit;`
			`import org.deeplearning4j.nn.weights.WeightInitUtil;`
			`import org.nd4j.linalg.api.ndarray.INDArray;`
			`import org.nd4j.linalg.factory.Nd4j;`
			`import org.nd4j.linalg.indexing.INDArrayIndex;`
			`import org.nd4j.linalg.indexing.NDArrayIndex;`

Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public class GravesLSTMParamInitializer extends AbstractParamInitializer {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00
			`private static final GravesLSTMParamInitializer INSTANCE = new GravesLSTMParamInitializer();`

			`public static GravesLSTMParamInitializer getInstance() {`
			`return INSTANCE;`
			`}`

			`/** Weights for previous time step -> current time step connections */`
			`public final static String RECURRENT_WEIGHT_KEY = LSTMParamInitializer.RECURRENT_WEIGHT_KEY;`
			`public final static String BIAS_KEY = LSTMParamInitializer.BIAS_KEY;`
			`public final static String INPUT_WEIGHT_KEY = LSTMParamInitializer.INPUT_WEIGHT_KEY;`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public long numParams(LayerConfiguration l) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf = (org.deeplearning4j.nn.conf.layers.GravesLSTM) l;`

			`val nL = layerConf.getNOut(); //i.e., n neurons in this layer`
			`val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer`

			`val nParams = nLast * (4 * nL) //"input" weights`
			`+ nL * (4 * nL + 3) //recurrent weights`
			`+ 4 * nL; //bias`

			`return nParams;`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public List<String> paramKeys(LayerConfiguration layer) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY, BIAS_KEY);`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public List<String> weightKeys(LayerConfiguration layer) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY);`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public List<String> biasKeys(LayerConfiguration layer) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`return Collections.singletonList(BIAS_KEY);`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public boolean isWeightParam(LayerConfiguration layer, String key) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`return RECURRENT_WEIGHT_KEY.equals(key) \|\| INPUT_WEIGHT_KEY.equals(key);`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public boolean isBiasParam(LayerConfiguration layer, String key) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`return BIAS_KEY.equals(key);`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public Map<String, INDArray> init(LayerConfiguration conf, INDArray paramsView, boolean initializeParams) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`Map<String, INDArray> params = Collections.synchronizedMap(new LinkedHashMap<String, INDArray>());`
			`org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf =`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`(org.deeplearning4j.nn.conf.layers.GravesLSTM) conf;`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`double forgetGateInit = layerConf.getForgetGateBiasInit();`

			`val nL = layerConf.getNOut(); //i.e., n neurons in this layer`
			`val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer`

			`conf.addVariable(INPUT_WEIGHT_KEY);`
			`conf.addVariable(RECURRENT_WEIGHT_KEY);`
			`conf.addVariable(BIAS_KEY);`

			`val length = numParams(conf);`
			`if (paramsView.length() != length)`
			`throw new IllegalStateException(`
			`"Expected params view of length " + length + ", got length " + paramsView.length());`

			`val nParamsIn = nLast * (4 * nL);`
			`val nParamsRecurrent = nL * (4 * nL + 3);`
			`val nBias = 4 * nL;`
			`INDArray inputWeightView = paramsView.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(0, nParamsIn));`
			`INDArray recurrentWeightView = paramsView.get(NDArrayIndex.interval(0,0,true),`
			`NDArrayIndex.interval(nParamsIn, nParamsIn + nParamsRecurrent));`
			`INDArray biasView = paramsView.get(NDArrayIndex.interval(0,0,true),`
			`NDArrayIndex.interval(nParamsIn + nParamsRecurrent, nParamsIn + nParamsRecurrent + nBias));`

			`if (initializeParams) {`
			`val fanIn = nL;`
			`val fanOut = nLast + nL;`
			`val inputWShape = new long[] {nLast, 4 * nL};`
			`val recurrentWShape = new long[] {nL, 4 * nL + 3};`

			`IWeightInit rwInit;`
Using @SuperBuilder for LayerConfigurations 2023-04-24 18:09:11 +02:00			`if(layerConf.getWeightInitRecurrent() != null){`
			`rwInit = layerConf.getWeightInitRecurrent();`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`} else {`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`rwInit = layerConf.getWeightInit();`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`}`

Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`params.put(INPUT_WEIGHT_KEY,layerConf.getWeightInit().init(fanIn, fanOut, inputWShape,`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, inputWeightView));`
			`params.put(RECURRENT_WEIGHT_KEY, rwInit.init(fanIn, fanOut, recurrentWShape,`
			`IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, recurrentWeightView));`
			`biasView.put(new INDArrayIndex[] {NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(nL, 2 * nL)},`
			`Nd4j.valueArrayOf(new long[]{1, nL}, forgetGateInit)); //Order: input, forget, output, input modulation, i.e., IFOG}`
			`/*The above line initializes the forget gate biases to specified value.`
			`* See Sutskever PhD thesis, pg19:`
			`* "it is important for [the forget gate activations] to be approximately 1 at the early stages of learning,`
			`* which is accomplished by initializing [the forget gate biases] to a large value (such as 5). If it is`
			`* not done, it will be harder to learn long range dependencies because the smaller values of the forget`
			`* gates will create a vanishing gradients problem."`
			`* http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf`
			`*/`
			`params.put(BIAS_KEY, biasView);`
			`} else {`
			`params.put(INPUT_WEIGHT_KEY, WeightInitUtil.reshapeWeights(new long[] {nLast, 4 * nL}, inputWeightView));`
			`params.put(RECURRENT_WEIGHT_KEY,`
			`WeightInitUtil.reshapeWeights(new long[] {nL, 4 * nL + 3}, recurrentWeightView));`
			`params.put(BIAS_KEY, biasView);`
			`}`

			`return params;`
			`}`

			`@Override`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`public Map<String, INDArray> getGradientsFromFlattened(LayerConfiguration conf, INDArray gradientView) {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf =`
Refactoring and separation of IModel / Layer 2023-03-23 17:39:00 +01:00			`(org.deeplearning4j.nn.conf.layers.GravesLSTM) conf;`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00
			`val nL = layerConf.getNOut(); //i.e., n neurons in this layer`
			`val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer`

			`val length = numParams(conf);`
			`if (gradientView.length() != length)`
			`throw new IllegalStateException(`
			`"Expected gradient view of length " + length + ", got length " + gradientView.length());`

			`val nParamsIn = nLast * (4 * nL);`
			`val nParamsRecurrent = nL * (4 * nL + 3);`
			`val nBias = 4 * nL;`
			`INDArray inputWeightGradView = gradientView.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(0, nParamsIn))`
			`.reshape('f', nLast, 4 * nL);`
			`INDArray recurrentWeightGradView = gradientView`
			`.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(nParamsIn, nParamsIn + nParamsRecurrent))`
			`.reshape('f', nL, 4 * nL + 3);`
			`INDArray biasGradView = gradientView.get(NDArrayIndex.interval(0,0,true),`
			`NDArrayIndex.interval(nParamsIn + nParamsRecurrent, nParamsIn + nParamsRecurrent + nBias)); //already a row vector`

			`Map<String, INDArray> out = new LinkedHashMap<>();`
			`out.put(INPUT_WEIGHT_KEY, inputWeightGradView);`
			`out.put(RECURRENT_WEIGHT_KEY, recurrentWeightGradView);`
			`out.put(BIAS_KEY, biasGradView);`

			`return out;`
			`}`
			`}`