/*
 * ******************************************************************************
 * *
 * *
 * * This program and the accompanying materials are made available under the
 * * terms of the Apache License, Version 2.0 which is available at
 * * https://www.apache.org/licenses/LICENSE-2.0.
 * *
 * * See the NOTICE file distributed with this work for additional
 * * information regarding copyright ownership.
 * * Unless required by applicable law or agreed to in writing, software
 * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * * License for the specific language governing permissions and limitations
 * * under the License.
 * *
 * * SPDX-License-Identifier: Apache-2.0
 * *****************************************************************************
 */

package org.deeplearning4j.nn.params;

import java.util.*;

import lombok.val;
import org.deeplearning4j.nn.api.AbstractParamInitializer;
import org.deeplearning4j.nn.conf.layers.LayerConfiguration;
import org.deeplearning4j.nn.weights.IWeightInit;
import org.deeplearning4j.nn.weights.WeightInitUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.INDArrayIndex;
import org.nd4j.linalg.indexing.NDArrayIndex;

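/**
 * Parameter initializer for the Graves variant of the LSTM layer
 * ({@code org.deeplearning4j.nn.conf.layers.GravesLSTM}), i.e. an LSTM with peephole connections.
 * All parameters are stored in a single flattened row vector laid out as
 * [input weights | recurrent weights | biases]; within each block the gate order is input, forget,
 * output, input modulation (IFOG), and the recurrent weight matrix has 4*nOut + 3 columns per row,
 * the extra three columns holding the peephole weights.
 */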
public class GravesLSTMParamInitializer extends AbstractParamInitializer {

    private static final GravesLSTMParamInitializer INSTANCE = new GravesLSTMParamInitializer();

    public static GravesLSTMParamInitializer getInstance() {
        return INSTANCE;
    }

    /** Weights for previous time step -> current time step connections */
    public final static String RECURRENT_WEIGHT_KEY = LSTMParamInitializer.RECURRENT_WEIGHT_KEY;
    public final static String BIAS_KEY = LSTMParamInitializer.BIAS_KEY;
    public final static String INPUT_WEIGHT_KEY = LSTMParamInitializer.INPUT_WEIGHT_KEY;

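    /**
     * Total parameter count: input weights (nIn * 4*nOut), recurrent weights including the three peephole
     * weights per unit (nOut * (4*nOut + 3)), and biases (4*nOut).
     * For example, nIn = 3 and nOut = 2 gives 3*8 + 2*11 + 4*2 = 24 + 22 + 8 = 54 parameters.
     */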
    @Override
    public long numParams(LayerConfiguration l) {
        org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf = (org.deeplearning4j.nn.conf.layers.GravesLSTM) l;

        val nL = layerConf.getNOut(); //i.e., n neurons in this layer
        val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer

        val nParams = nLast * (4 * nL) //"input" weights
                        + nL * (4 * nL + 3) //recurrent weights
                        + 4 * nL; //bias

        return nParams;
    }

    @Override
    public List<String> paramKeys(LayerConfiguration layer) {
        return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY, BIAS_KEY);
    }

    @Override
    public List<String> weightKeys(LayerConfiguration layer) {
        return Arrays.asList(INPUT_WEIGHT_KEY, RECURRENT_WEIGHT_KEY);
    }

    @Override
    public List<String> biasKeys(LayerConfiguration layer) {
        return Collections.singletonList(BIAS_KEY);
    }

    @Override
    public boolean isWeightParam(LayerConfiguration layer, String key) {
        return RECURRENT_WEIGHT_KEY.equals(key) || INPUT_WEIGHT_KEY.equals(key);
    }

    @Override
    public boolean isBiasParam(LayerConfiguration layer, String key) {
        return BIAS_KEY.equals(key);
    }

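    /**
     * Initializes all parameters as views into the single flattened {@code paramsView} row vector, laid out as
     * [input weights | recurrent weights | biases]. When {@code initializeParams} is true, the weights are filled
     * in using the configured weight initialization and the forget gate biases are set to the layer's
     * forgetGateBiasInit value; otherwise the existing values in {@code paramsView} are only reshaped into views
     * of the expected shapes.
     *
     * <p>A minimal usage sketch (the {@code GravesLSTM.Builder} call shown is an assumption about the layer's
     * builder API, not something defined in this class):
     * <pre>{@code
     * LayerConfiguration cfg = new org.deeplearning4j.nn.conf.layers.GravesLSTM.Builder().nIn(3).nOut(2).build();
     * GravesLSTMParamInitializer initializer = GravesLSTMParamInitializer.getInstance();
     * INDArray view = Nd4j.create(1, initializer.numParams(cfg));
     * Map<String, INDArray> params = initializer.init(cfg, view, true);
     * }</pre>
     */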
    @Override
    public Map<String, INDArray> init(LayerConfiguration conf, INDArray paramsView, boolean initializeParams) {
        Map<String, INDArray> params = Collections.synchronizedMap(new LinkedHashMap<String, INDArray>());
        org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf =
                        (org.deeplearning4j.nn.conf.layers.GravesLSTM) conf;
        double forgetGateInit = layerConf.getForgetGateBiasInit();

        val nL = layerConf.getNOut(); //i.e., n neurons in this layer
        val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer

        conf.addVariable(INPUT_WEIGHT_KEY);
        conf.addVariable(RECURRENT_WEIGHT_KEY);
        conf.addVariable(BIAS_KEY);

        val length = numParams(conf);
        if (paramsView.length() != length)
            throw new IllegalStateException(
                            "Expected params view of length " + length + ", got length " + paramsView.length());

        val nParamsIn = nLast * (4 * nL);
        val nParamsRecurrent = nL * (4 * nL + 3);
        val nBias = 4 * nL;
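        // Slice the flat parameter vector into its [input weights | recurrent weights | biases] segments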
        INDArray inputWeightView = paramsView.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(0, nParamsIn));
        INDArray recurrentWeightView = paramsView.get(NDArrayIndex.interval(0,0,true),
                        NDArrayIndex.interval(nParamsIn, nParamsIn + nParamsRecurrent));
        INDArray biasView = paramsView.get(NDArrayIndex.interval(0,0,true),
                        NDArrayIndex.interval(nParamsIn + nParamsRecurrent, nParamsIn + nParamsRecurrent + nBias));

        if (initializeParams) {
            val fanIn = nL;
            val fanOut = nLast + nL;
            val inputWShape = new long[] {nLast, 4 * nL};
            val recurrentWShape = new long[] {nL, 4 * nL + 3};

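            // Use a dedicated recurrent weight initialization if one is configured;
            // otherwise fall back to the layer's main weight initialization scheme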
            IWeightInit rwInit;
            if (layerConf.getWeightInitRecurrent() != null) {
                rwInit = layerConf.getWeightInitRecurrent();
            } else {
                rwInit = layerConf.getWeightInit();
            }

            params.put(INPUT_WEIGHT_KEY, layerConf.getWeightInit().init(fanIn, fanOut, inputWShape,
                            IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, inputWeightView));
            params.put(RECURRENT_WEIGHT_KEY, rwInit.init(fanIn, fanOut, recurrentWShape,
                            IWeightInit.DEFAULT_WEIGHT_INIT_ORDER, recurrentWeightView));
            biasView.put(new INDArrayIndex[] {NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(nL, 2 * nL)},
                            Nd4j.valueArrayOf(new long[]{1, nL}, forgetGateInit)); //Order: input, forget, output, input modulation, i.e., IFOG
            /* The above line initializes the forget gate biases to the specified value.
             * See Sutskever PhD thesis, pg19:
             * "it is important for [the forget gate activations] to be approximately 1 at the early stages of learning,
             * which is accomplished by initializing [the forget gate biases] to a large value (such as 5). If it is
             * not done, it will be harder to learn long range dependencies because the smaller values of the forget
             * gates will create a vanishing gradients problem."
             * http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf
             */
            params.put(BIAS_KEY, biasView);
        } else {
            params.put(INPUT_WEIGHT_KEY, WeightInitUtil.reshapeWeights(new long[] {nLast, 4 * nL}, inputWeightView));
            params.put(RECURRENT_WEIGHT_KEY,
                            WeightInitUtil.reshapeWeights(new long[] {nL, 4 * nL + 3}, recurrentWeightView));
            params.put(BIAS_KEY, biasView);
        }

        return params;
    }

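    /**
     * Returns views into the flattened {@code gradientView} vector for each parameter, reshaped in column-major
     * ('f') order to the same shapes as the corresponding parameters, so that gradients written into these views
     * land directly in the flat gradient vector.
     */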
    @Override
    public Map<String, INDArray> getGradientsFromFlattened(LayerConfiguration conf, INDArray gradientView) {
        org.deeplearning4j.nn.conf.layers.GravesLSTM layerConf =
                        (org.deeplearning4j.nn.conf.layers.GravesLSTM) conf;

        val nL = layerConf.getNOut(); //i.e., n neurons in this layer
        val nLast = layerConf.getNIn(); //i.e., n neurons in previous layer

        val length = numParams(conf);
        if (gradientView.length() != length)
            throw new IllegalStateException(
                            "Expected gradient view of length " + length + ", got length " + gradientView.length());

        val nParamsIn = nLast * (4 * nL);
        val nParamsRecurrent = nL * (4 * nL + 3);
        val nBias = 4 * nL;
        INDArray inputWeightGradView = gradientView.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(0, nParamsIn))
                        .reshape('f', nLast, 4 * nL);
        INDArray recurrentWeightGradView = gradientView
                        .get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(nParamsIn, nParamsIn + nParamsRecurrent))
                        .reshape('f', nL, 4 * nL + 3);
        INDArray biasGradView = gradientView.get(NDArrayIndex.interval(0,0,true),
                        NDArrayIndex.interval(nParamsIn + nParamsRecurrent, nParamsIn + nParamsRecurrent + nBias)); //already a row vector

        Map<String, INDArray> out = new LinkedHashMap<>();
        out.put(INPUT_WEIGHT_KEY, inputWeightGradView);
        out.put(RECURRENT_WEIGHT_KEY, recurrentWeightGradView);
        out.put(BIAS_KEY, biasGradView);

        return out;
    }
}