2021-02-01 14:31:20 +09:00
|
|
|
/*
|
|
|
|
|
* ******************************************************************************
|
|
|
|
|
* *
|
|
|
|
|
* *
|
|
|
|
|
* * This program and the accompanying materials are made available under the
|
|
|
|
|
* * terms of the Apache License, Version 2.0 which is available at
|
|
|
|
|
* * https://www.apache.org/licenses/LICENSE-2.0.
|
|
|
|
|
* *
|
2021-02-01 17:47:29 +09:00
|
|
|
* * See the NOTICE file distributed with this work for additional
|
|
|
|
|
* * information regarding copyright ownership.
|
2021-02-01 14:31:20 +09:00
|
|
|
* * Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
|
|
|
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
|
|
|
* * License for the specific language governing permissions and limitations
|
|
|
|
|
* * under the License.
|
|
|
|
|
* *
|
|
|
|
|
* * SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
* *****************************************************************************
|
|
|
|
|
*/
|
2019-06-06 15:21:15 +03:00
|
|
|
|
|
|
|
|
package org.deeplearning4j.nn.layers;
|
|
|
|
|
|
2023-03-23 17:39:00 +01:00
|
|
|
import java.lang.reflect.Constructor;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import lombok.Getter;
|
|
|
|
|
import lombok.NonNull;
|
|
|
|
|
import lombok.Setter;
|
2020-04-23 01:36:49 +03:00
|
|
|
import lombok.extern.slf4j.Slf4j;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.exception.DL4JInvalidInputException;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.nn.api.ITrainableLayer;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.nn.api.Layer;
|
|
|
|
|
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.nn.conf.layers.BaseLayerConfiguration;
|
|
|
|
|
import org.deeplearning4j.nn.conf.layers.LayerConfiguration;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.nn.gradient.DefaultGradient;
|
|
|
|
|
import org.deeplearning4j.nn.gradient.Gradient;
|
|
|
|
|
import org.deeplearning4j.nn.params.DefaultParamInitializer;
|
|
|
|
|
import org.deeplearning4j.nn.workspace.ArrayType;
|
|
|
|
|
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
|
|
|
|
|
import org.deeplearning4j.optimize.Solver;
|
|
|
|
|
import org.deeplearning4j.optimize.api.ConvexOptimizer;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.optimize.api.TrainingListener;
|
|
|
|
|
import org.nd4j.common.primitives.Pair;
|
|
|
|
|
import org.nd4j.evaluation.IEvaluation;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.nd4j.linalg.api.buffer.DataType;
|
|
|
|
|
import org.nd4j.linalg.api.memory.MemoryWorkspace;
|
|
|
|
|
import org.nd4j.linalg.api.ndarray.INDArray;
|
|
|
|
|
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNorm;
|
|
|
|
|
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNormBp;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.nd4j.linalg.dataset.api.DataSet;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.MultiDataSet;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.nd4j.linalg.factory.Nd4j;
|
|
|
|
|
import org.nd4j.linalg.indexing.NDArrayIndex;
|
|
|
|
|
import org.nd4j.linalg.learning.regularization.Regularization;
|
|
|
|
|
|
2023-03-23 17:39:00 +01:00
|
|
|
/** A layer with parameters */
|
2020-04-23 01:36:49 +03:00
|
|
|
@Slf4j
|
2023-03-23 17:39:00 +01:00
|
|
|
public abstract class BaseLayer<LayerConfT extends BaseLayerConfiguration>
|
|
|
|
|
extends AbstractLayer<LayerConfT> implements ITrainableLayer {
|
|
|
|
|
|
|
|
|
|
  /** Score (loss value) computed by the most recent computeGradientAndScore() call. */
  protected double score = 0.0;
  /** Optimizer for standalone fitting of this layer; lazily created by getOptimizer(). */
  protected ConvexOptimizer optimizer;
  /** Gradient from the most recent backward pass, as returned by gradient(). */
  protected Gradient gradient;
  /** Solver used by fit(INDArray, LayerWorkspaceMgr); lazily created on first fit. */
  protected Solver solver;
  /** Post-noise parameters cached during training (see getParamWithNoise); cleared after backprop. */
  protected Map<String, INDArray> weightNoiseParams = new HashMap<>();
  /** Flattened view of all parameters (typically a view of a larger, network-level array). */
  protected INDArray paramsFlattened;
  /** Flattened view of all gradients, set via setBackpropGradientsViewArray(). */
  protected INDArray gradientsFlattened;

  /** Map of parameter key -> parameter array (values are typically views of paramsFlattened). */
  @Getter @Setter protected Map<String, INDArray> paramTable;

  /** Map of parameter key -> gradient view (views of gradientsFlattened); not serialized. */
  @Getter protected transient Map<String, INDArray> gradientViews;
|
|
|
|
|
|
|
|
|
|
  /**
   * Creates a layer from the given configuration.
   *
   * <p>Note: parameter access on this class is routed through the paramTable rather than a single
   * {@code INDArray params} field.
   *
   * @param conf the layer configuration
   * @param dataType the data type for parameters and activations
   */
  public BaseLayer(LayerConfiguration conf, DataType dataType) {
    super(conf, dataType);
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * This method executes evaluation of the model against given iterator and evaluation
   * implementations.
   *
   * <p>NOTE(review): not implemented at this level — always returns {@code null}, regardless of
   * input. Callers expecting populated evaluations must not rely on this base implementation.
   *
   * @param iterator data to evaluate on (ignored)
   * @param evaluations evaluation implementations (ignored)
   * @return always {@code null} in this base implementation
   */
  @Override
  public <T extends IEvaluation> T[] doEvaluation(DataSetIterator iterator, T... evaluations) {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * This method executes evaluation of the model against given iterator and evaluation
   * implementations.
   *
   * <p>NOTE(review): not implemented at this level — always returns {@code null}, regardless of
   * input.
   *
   * @param iterator data to evaluate on (ignored)
   * @param evaluations evaluation implementations (ignored)
   * @return always {@code null} in this base implementation
   */
  @Override
  public <T extends IEvaluation> T[] doEvaluation(MultiDataSetIterator iterator, T... evaluations) {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /** Init the model. No-op in this base implementation. */
  @Override
  public void init() {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Update layer weights and biases with gradient change.
   *
   * <p>No-op in this base implementation.
   *
   * @param gradient the gradient to apply (ignored)
   */
  @Override
  public void update(Gradient gradient) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Perform one update applying the gradient.
   *
   * <p>No-op in this base implementation.
   *
   * @param gradient the gradient to apply (ignored)
   * @param paramType the parameter key the gradient applies to (ignored)
   */
  @Override
  public void update(INDArray gradient, String paramType) {}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* the number of parameters for the model
|
|
|
|
|
*
|
|
|
|
|
* @return the number of parameters for the model
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public long numParams() {
|
|
|
|
|
int ret = 0;
|
|
|
|
|
for (INDArray val : paramTable.values()) ret += val.length();
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * The current inputs batch size.
   *
   * @return always 0 in this base implementation
   */
  @Override
  public int batchSize() {
    return 0;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Set the {@link TrainingListener}s for this model. If any listeners have previously been set,
   * they will be replaced by this method.
   *
   * <p>NOTE(review): delegates via {@link List#of(Object[])}, which rejects both a null array and
   * null elements — passing either will throw NullPointerException.
   *
   * @param listeners the listeners to set
   */
  @Override
  public void addTrainingListeners(TrainingListener... listeners) {
    addTrainingListeners(List.of(listeners));
  }
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Set the parameters for a given parameter type.
|
|
|
|
|
*
|
|
|
|
|
* @param key the param type key to set
|
|
|
|
|
* @param val the new parameters ndarray
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public void setParam(String key, INDArray val) {
|
|
|
|
|
if (paramTable.containsKey(key)) {
|
|
|
|
|
paramTable.get(key).assign(val);
|
|
|
|
|
} else {
|
|
|
|
|
paramTable.put(key, val);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the parameter array registered under the given key.
   *
   * @param param the parameter key
   * @return the parameter ndarray, or null if no such parameter exists
   */
  @Override
  public INDArray getParam(String param) {
    return paramTable.get(param);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setParams(INDArray params) {
|
|
|
|
|
if (params == paramsFlattened) return; // no op
|
|
|
|
|
setParams(params, 'f');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the flattened (1d) parameter vector for this layer.
   *
   * <p>Note: AbstractLayer itself does not implement params/paramTable/gradient views; this
   * override returns the flattened view held by BaseLayer.
   *
   * @return 1d parameter vector (may be null if parameters have not been set)
   */
  @Override
  public INDArray getParams() {
    return paramsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
  /** Close the layer. No-op in this base implementation. */
  @Override
  public void close() {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given DataSet.
   *
   * <p>No-op in this base implementation.
   *
   * @param dataSet the data set to fit on (ignored)
   */
  @Override
  public void fit(DataSet dataSet) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given MultiDataSet.
   *
   * <p>No-op in this base implementation.
   *
   * @param dataSet the data set to fit on (ignored)
   */
  @Override
  public void fit(MultiDataSet dataSet) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given DataSetIterator.
   *
   * <p>No-op in this base implementation.
   *
   * @param iterator the iterator to fit on (ignored)
   */
  @Override
  public void fit(DataSetIterator iterator) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given MultiDataSetIterator.
   *
   * <p>No-op in this base implementation.
   *
   * @param iterator the iterator to fit on (ignored)
   */
  @Override
  public void fit(MultiDataSetIterator iterator) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method returns updater state (if applicable), null otherwise.
   *
   * @return always {@code null}: this base layer holds no updater state
   */
  @Override
  public INDArray updaterState() {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Backpropagates the incoming error signal through this layer, producing the parameter
   * gradients and the epsilon to pass to the previous layer. Some texts denote the incoming error
   * as \epsilon; see e.g. http://web.cs.swarthmore.edu/~meeden/cs81/s10/BackPropDeriv.pdf
   *
   * @param epsilon w^(L+1)*delta^(L+1). Or, equiv: dC/da, i.e., (dC/dz)*(dz/da) = dC/da, where C is
   *     cost function a=sigma(z) is activation.
   * @param workspaceMgr Workspace manager
   * @return pair of (gradients for this layer's parameters, epsilon for the previous layer)
   */
  @Override
  public Pair<Gradient, INDArray> backpropGradient(
      INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
    assertInputSet(true);
    // If this layer is layer L, then epsilon is (w^(L+1)*(d^(L+1))^T) (or equivalent)
    Pair<INDArray, INDArray> zAndPreNorm = preOutputWithPreNorm(true, true, workspaceMgr);
    INDArray z =
        zAndPreNorm.getFirst(); // Note: using preOutput(INDArray) can't be used as this does a
    // setInput(input) and resets the 'appliedDropout' flag
    INDArray preNorm = zAndPreNorm.getSecond();
    // delta = dC/dz: activation-function backprop applied to the incoming epsilon
    INDArray delta =
        getTypedLayerConfiguration()
            .getActivation()
            .backprop(z, epsilon)
            .getFirst(); // TODO handle activation function params

    if (maskArray != null) {
      applyMask(delta);
    }

    Gradient ret = new DefaultGradient();

    if (hasBias()) {
      // Bias gradient: per-column sum of delta, written directly into the gradient view
      INDArray biasGrad = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
      delta.sum(biasGrad, 0); // biasGrad is initialized/zeroed first
      ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGrad);
    }

    INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, true, workspaceMgr);

    // Epsilon for the previous layer, shaped [nIn, miniBatch] pre-transpose, 'f' ordered
    INDArray epsilonNext =
        workspaceMgr.createUninitialized(
            ArrayType.ACTIVATION_GRAD,
            delta.dataType(),
            new long[] {W.size(0), delta.size(0)},
            'f');
    if (hasLayerNorm()) {
      INDArray g = getParam(DefaultParamInitializer.GAIN_KEY);

      INDArray dldg = gradientViews.get(DefaultParamInitializer.GAIN_KEY);
      // Layer-norm backprop: modifies delta in place and writes the gain gradient into dldg
      Nd4j.getExecutioner().exec(new LayerNormBp(preNorm, g, delta, delta, dldg, true, 1));
      ret.gradientForVariable().put(DefaultParamInitializer.GAIN_KEY, dldg);
    }

    epsilonNext =
        W.mmuli(delta.transpose(), epsilonNext)
            .transpose(); // W.mmul(delta.transpose()).transpose();

    INDArray weightGrad = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY); // f order
    // weightGrad = input^T * delta, written directly into the gradient view (beta = 0.0)
    Nd4j.gemm(
        getInput().castTo(weightGrad.dataType()),
        delta,
        weightGrad,
        true,
        false,
        1.0,
        0.0); // TODO avoid castTo?
    ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGrad);

    // Post-noise params are valid for exactly one forward+backward pass; drop them now
    weightNoiseParams.clear();

    epsilonNext = backpropDropOutIfPresent(epsilonNext);
    return new Pair<>(ret, epsilonNext);
  }
|
|
|
|
|
|
|
|
|
|
public void computeGradientAndScore(LayerWorkspaceMgr workspaceMgr) {
|
|
|
|
|
if (getInput() == null) {
|
|
|
|
|
log.warn("There is no input for this layer '{}'", layerConfiguration);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
INDArray output = activate(true, workspaceMgr);
|
|
|
|
|
setScoreWithZ(output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /** Hook for subclasses to derive this layer's score from the forward-pass output; no-op here. */
  protected void setScoreWithZ(INDArray z) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Objective function: the specified objective.
   *
   * @return the score for the objective, as computed by the last computeGradientAndScore() call
   */
  @Override
  public double getScore() {
    return score;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the gradient stored from the most recent backward pass.
   *
   * @return the stored gradient (may be null before any backward pass)
   */
  @Override
  public Gradient gradient() {
    return gradient;
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ConvexOptimizer getOptimizer() {
|
|
|
|
|
if (optimizer == null) {
|
|
|
|
|
Solver solver = new Solver.Builder().model(this).configure(getNetConfiguration()).build();
|
|
|
|
|
this.optimizer = solver.getOptimizer();
|
|
|
|
|
}
|
|
|
|
|
return optimizer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the parameters of the neural network as a flattened row vector.
   *
   * @return the parameters of the neural network
   */
  @Override
  public INDArray getModelParams() {
    return paramsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
public void setParamsTable(INDArray paramsTable) {
|
|
|
|
|
if (paramsTable == paramsFlattened) {
|
|
|
|
|
return; // no op
|
|
|
|
|
}
|
|
|
|
|
setParams(paramsTable, 'f');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Copies the given flattened parameter vector into this layer's parameter arrays.
   *
   * <p>Validates the total length against the layer's declared variables, then assigns each
   * parameter, in paramTable iteration order, from consecutive slices of {@code params}.
   *
   * <p>NOTE(review): the length check iterates {@code layerConfiguration.getVariables()} while the
   * assignment loop iterates {@code getParamTable().keySet()} — these are assumed to cover the
   * same parameters; confirm they cannot diverge. Also assumes {@code params} is a row vector
   * (indexed with point(0)), and accumulates the expected length in an int.
   *
   * @param params flattened parameter vector; length must equal the total parameter count
   * @param order ordering ('c' or 'f') used when reshaping each slice to the parameter's shape
   * @throws IllegalArgumentException if params has the wrong total length
   * @throws IllegalStateException if a slice length does not match the target parameter length
   */
  protected void setParams(INDArray params, char order) {
    if (params == null) {
      log.trace(
          "setParams(INDArray params, char order): params is null. Skipping setParams in Layer {}[{}] at index {}",
          getLayerConfiguration().getName(),
          getClass().getSimpleName(),
          getIndex());
      return;
    }
    Set<String> parameterList = layerConfiguration.getVariables(); // netWideVariables();
    int length = 0;
    for (String s : parameterList) {
      length += getParam(s).length();
    }
    if (params.length() != length) {
      throw new IllegalArgumentException(
          "Unable to set parameters: must be of length "
              + length
              + ", got params of length "
              + params.length()
              + " - "
              + layerId());
    }
    int idx = 0;
    Set<String> paramKeySet = this.getParamTable().keySet();
    for (String s : paramKeySet) {
      INDArray param = getParam(s);
      // Slice the next param.length() elements out of the flattened row vector
      INDArray get =
          params.get(NDArrayIndex.point(0), NDArrayIndex.interval(idx, idx + param.length()));
      if (param.length() != get.length()) {
        throw new IllegalStateException(
            "Parameter "
                + s
                + " should have been of length "
                + param.length()
                + " but was "
                + get.length()
                + " - "
                + layerId());
      }
      param.assign(
          get.reshape(
              order,
              param.shape())); // Use assign due to backprop params being a view of a larger array
      idx += param.length();
    }
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setParamsViewArray(INDArray params) {
|
|
|
|
|
if (this.getParamTable() != null && params.length() != numParams()) {
|
|
|
|
|
throw new IllegalArgumentException(
|
|
|
|
|
"Invalid input: expect params of length "
|
|
|
|
|
+ numParams()
|
|
|
|
|
+ ", got params of length "
|
|
|
|
|
+ params.length()
|
|
|
|
|
+ " - "
|
|
|
|
|
+ layerId());
|
|
|
|
|
}
|
|
|
|
|
this.paramsFlattened = params;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the full parameter table for this layer.
   *
   * @param isBackprop ignored in this implementation — the same table is returned either way
   * @return map of parameter key to parameter array
   */
  @Override
  public Map<String, INDArray> getParamTable(boolean isBackprop) {
    return paramTable;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the flattened gradients view array, as set via setBackpropGradientsViewArray().
   *
   * @return the flattened gradients array (may be null if not yet set)
   */
  @Override
  public INDArray getGradientsViewArray() {
    return gradientsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setBackpropGradientsViewArray(INDArray gradients) {
|
|
|
|
|
if (this.getParamTable() != null && gradients.length() != numParams()) {
|
|
|
|
|
throw new IllegalArgumentException(
|
|
|
|
|
"Invalid input: expect gradients array of length "
|
|
|
|
|
+ numParams(true)
|
|
|
|
|
+ ", got array of length "
|
|
|
|
|
+ gradients.length()
|
|
|
|
|
+ " - "
|
|
|
|
|
+ layerId());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.gradientsFlattened = gradients;
|
|
|
|
|
this.gradientViews =
|
|
|
|
|
layerConfiguration.initializer().getGradientsFromFlattened(layerConfiguration, gradients);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Get the parameter, after applying any weight noise (such as DropConnect) if necessary. Note
   * that during training, this will store the post-noise parameters, as these should be used for
   * both forward pass and backprop, for a single iteration. Consequently, the parameters (post
   * noise) should be cleared after each training iteration
   *
   * @param param Parameter key
   * @param training If true: during training
   * @param workspaceMgr workspace manager
   * @return The parameter, after applying any noise
   */
  protected INDArray getParamWithNoise(
      @NonNull String param, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
    INDArray p;
    LayerConfiguration lconf = getLayerConfiguration();
    if (lconf.getWeightNoise() != null) {
      if (training && weightNoiseParams.size() > 0 && weightNoiseParams.containsKey(param)) {
        // Re-use these weights for both forward pass and backprop - don't want to use 2 different
        // params here
        // These should be cleared during backprop
        return weightNoiseParams.get(param);
      } else {
        // Generate the noisy parameter outside any workspace so it survives workspace scope exits
        try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) {
          p =
              lconf
                  .getWeightNoise()
                  .getParameter(
                      this, param, getIterationCount(), getEpochCount(), training, workspaceMgr);
        }
      }

      if (training) {
        // Store for re-use in backprop
        weightNoiseParams.put(param, p);
      }
    } else {
      // No weight noise configured: return the raw parameter directly
      return getParam(param);
    }

    return p;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Computes the pre-activation output z = input*W + b (with optional layer normalization).
   *
   * @param training whether in training mode (affects dropout / weight noise)
   * @param workspaceMgr workspace manager
   * @return the pre-activation array
   */
  protected INDArray preOutput(boolean training, LayerWorkspaceMgr workspaceMgr) {
    return preOutputWithPreNorm(training, false, workspaceMgr).getFirst();
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Computes the pre-activation output z = input*W (+ layer norm) + b, optionally also returning
   * the pre-normalization activations needed for layer-norm backprop.
   *
   * @param training whether in training mode (affects dropout / weight noise)
   * @param forBackprop if true, the pre-norm array is duplicated so backprop can use it unmodified
   * @param workspaceMgr workspace manager
   * @return pair of (pre-activation output, pre-layer-norm activations); the two are the same
   *     array when layer norm is disabled or forBackprop is false
   * @throws DL4JInvalidInputException if the input is not rank 2 or its column count does not
   *     match the weight matrix row count
   */
  protected Pair<INDArray, INDArray> preOutputWithPreNorm(
      boolean training, boolean forBackprop, @NonNull LayerWorkspaceMgr workspaceMgr) {
    assertInputSet(forBackprop);
    applyDropOutIfNecessary(training, workspaceMgr);
    INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, training, workspaceMgr);
    INDArray b = getParamWithNoise(DefaultParamInitializer.BIAS_KEY, training, workspaceMgr);
    // Gain parameter only exists when layer norm is enabled
    INDArray g = (hasLayerNorm() ? getParam(DefaultParamInitializer.GAIN_KEY) : null);

    INDArray input = getInput().castTo(dataType);

    // Input validation:
    if (input.rank() != 2 || input.columns() != W.rows()) {
      if (input.rank() != 2) {
        throw new DL4JInvalidInputException(
            "Input that is not a matrix; expected matrix (rank 2), got rank "
                + input.rank()
                + " array with shape "
                + Arrays.toString(input.shape())
                + ". Missing preprocessor or wrong input type? "
                + layerId());
      }
      throw new DL4JInvalidInputException(
          "Input size ("
              + input.columns()
              + " columns; shape = "
              + Arrays.toString(input.shape())
              + ") is invalid: does not match layer input size (layer # inputs = "
              + W.size(0)
              + ") "
              + layerId());
    }

    INDArray ret =
        workspaceMgr.createUninitialized(
            ArrayType.ACTIVATIONS, W.dataType(), input.size(0), W.size(1));
    input
        .castTo(ret.dataType())
        .mmuli(
            W, ret); // TODO Can we avoid this cast? (It should be a no op if not required, however)

    INDArray preNorm = ret;
    if (hasLayerNorm()) {
      // For backprop, keep a copy of the pre-norm activations; LayerNorm overwrites ret in place
      preNorm = (forBackprop ? ret.dup(ret.ordering()) : ret);
      Nd4j.getExecutioner().exec(new LayerNorm(preNorm, g, ret, true, 1));
    }

    // Bias is added AFTER layer normalization
    if (hasBias()) {
      ret.addiRowVector(b);
    }

    if (maskArray != null) {
      applyMask(ret);
    }

    return new Pair<>(ret, preNorm);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
|
|
|
|
|
INDArray z = preOutput(training, workspaceMgr); // (Input*Weights) + bias
|
|
|
|
|
INDArray ret = getTypedLayerConfiguration().getActivationFn().getActivation(z, training);
|
|
|
|
|
|
|
|
|
|
if (maskArray != null) {
|
|
|
|
|
applyMask(ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Sets the given input on this layer and then runs the forward pass.
   *
   * @param input the input to set
   * @param training whether in training mode
   * @param workspaceMgr workspace manager
   * @return the layer's activations
   */
  @Override
  public INDArray activate(
      @NonNull INDArray input, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
    setInput(input, workspaceMgr);
    return activate(training, workspaceMgr);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public double calcRegularizationScore(boolean backpropParamsOnly) {
|
|
|
|
|
double scoreSum = 0.0;
|
|
|
|
|
for (Map.Entry<String, INDArray> e : getParamTable().entrySet()) {
|
|
|
|
|
List<Regularization> l = getTypedLayerConfiguration().getRegularizationByParam(e.getKey());
|
|
|
|
|
if (l == null || l.isEmpty()) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
for (Regularization r : l) {
|
|
|
|
|
scoreSum += r.score(e.getValue(), getIterationCount(), getEpochCount());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return scoreSum;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public Layer clone() {
|
|
|
|
|
Layer layer = null;
|
|
|
|
|
try {
|
|
|
|
|
Constructor c = getClass().getConstructor(NeuralNetConfiguration.class);
|
|
|
|
|
layer = (Layer) c.newInstance(layerConfiguration);
|
|
|
|
|
Map<String, INDArray> linkedTable = new LinkedHashMap<>();
|
|
|
|
|
for (Map.Entry<String, INDArray> entry : getParamTable().entrySet()) {
|
|
|
|
|
linkedTable.put(entry.getKey(), entry.getValue().dup());
|
|
|
|
|
}
|
|
|
|
|
layer.setParamTable(linkedTable);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
log.error("", e);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return layer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Fits this layer on the given input, lazily constructing a Solver on first use and delegating
   * the optimization to it.
   *
   * @param input training input; if null, the previously-set input is used unchanged
   * @param workspaceMgr workspace manager
   */
  @Override
  public void fit(INDArray input, LayerWorkspaceMgr workspaceMgr) {
    if (input != null) {
      setInput(input, workspaceMgr);
      applyDropOutIfNecessary(true, workspaceMgr);
    }
    if (solver == null) {
      solver =
          new Solver.Builder()
              .model(this)
              .configure(getNetConfiguration())
              .listeners(getTrainingListeners())
              .build();
    }
    // Keep the cached optimizer in sync with the solver used for fitting
    this.optimizer = solver.getOptimizer();
    solver.optimize(workspaceMgr);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return getClass().getName()
|
|
|
|
|
+ "{"
|
|
|
|
|
+ "conf="
|
|
|
|
|
+ layerConfiguration
|
|
|
|
|
+ ", score="
|
|
|
|
|
+ score
|
|
|
|
|
+ ", optimizer="
|
|
|
|
|
+ optimizer
|
|
|
|
|
+ ", listeners="
|
|
|
|
|
+ trainingListeners
|
|
|
|
|
+ '}';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /** Clears layer state (via super) plus any cached post-noise parameters. */
  @Override
  public void clear() {
    super.clear();
    weightNoiseParams.clear();
  }
|
|
|
|
|
|
|
|
|
|
  /** Clears any cached post-noise parameters (see getParamWithNoise). */
  @Override
  public void clearNoiseWeightParams() {
    weightNoiseParams.clear();
  }
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Does this layer have no bias term? Many layers (dense, convolutional, output, embedding) have
|
|
|
|
|
* biases by default, but no-bias versions are possible via configuration
|
|
|
|
|
*
|
|
|
|
|
* @return True if a bias term is present, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
public boolean hasBias() {
|
|
|
|
|
// Overridden by layers supporting no bias mode: dense, output, convolutional, embedding
|
2023-05-08 09:34:44 +02:00
|
|
|
//return true;
|
2023-03-23 17:39:00 +01:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Does this layer support and is it enabled layer normalization? Only Dense and SimpleRNN Layers
   * support layer normalization.
   *
   * @return True if layer normalization is enabled on this layer, false otherwise
   */
  public boolean hasLayerNorm() {
    // Overridden by layers supporting layer normalization.
    return false;
  }
|
2019-06-06 15:21:15 +03:00
|
|
|
}
|