/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.nn.layers;
import java.lang.reflect.Constructor;
import java.util.*;
import lombok.Getter;
import lombok.NonNull;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.exception.DL4JInvalidInputException;
import org.deeplearning4j.nn.api.ITrainableLayer;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.BaseLayerConfiguration;
import org.deeplearning4j.nn.conf.layers.LayerConfiguration;
import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.params.DefaultParamInitializer;
import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.deeplearning4j.optimize.Solver;
import org.deeplearning4j.optimize.api.ConvexOptimizer;
import org.deeplearning4j.optimize.api.TrainingListener;
import org.nd4j.common.primitives.Pair;
import org.nd4j.evaluation.IEvaluation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNorm;
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNormBp;
import org.nd4j.linalg.dataset.api.DataSet;
import org.nd4j.linalg.dataset.api.MultiDataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.learning.regularization.Regularization;
/** A layer with parameters */
@Slf4j
public abstract class BaseLayer<LayerConfT extends BaseLayerConfiguration>
extends AbstractLayer<LayerConfT> implements ITrainableLayer {
// Current objective-function score for this layer; subclasses update it via setScoreWithZ.
protected double score = 0.0;
// Optimizer used when fitting this layer in isolation; lazily created (see getOptimizer()).
protected ConvexOptimizer optimizer;
// Most recently computed gradient, exposed via gradient().
protected Gradient gradient;
// Solver driving single-layer fit(INDArray, LayerWorkspaceMgr) calls; built lazily.
protected Solver solver;
// Cache of post-noise parameters (e.g. DropConnect) reused between forward pass and backprop;
// cleared after each iteration (see getParamWithNoise, clear, clearNoiseWeightParams).
protected Map<String, INDArray> weightNoiseParams = new HashMap<>();
// Flattened 1d view of all parameters for this layer.
protected INDArray paramsFlattened;
// Flattened 1d view of all gradients; entries of gradientViews are views into this array.
protected INDArray gradientsFlattened;
// Per-parameter-key arrays (e.g. weight "W", bias "b", gain "g").
@Getter @Setter protected Map<String, INDArray> paramTable;
// Per-parameter-key gradient views into gradientsFlattened; not serialized.
@Getter protected transient Map<String, INDArray> gradientViews;
/**
 * Creates the layer from its configuration. Parameter state (paramTable, flattened views) is
 * populated later via the setParams* / setParamsViewArray methods, not here.
 *
 * @param conf the layer configuration
 * @param dataType data type for parameters and activations
 */
public BaseLayer(LayerConfiguration conf, DataType dataType) {
  super(conf, dataType);
}
/**
 * This method executes evaluation of the model against given iterator and evaluation
 * implementations.
 *
 * <p>NOTE(review): always returns {@code null} — evaluation is not implemented at the
 * single-layer level.
 *
 * @param iterator data to evaluate on
 * @param evaluations evaluation implementations to populate
 * @return always {@code null}
 */
@Override
public <T extends IEvaluation> T[] doEvaluation(DataSetIterator iterator, T... evaluations) {
  return null;
}
/**
 * This method executes evaluation of the model against given iterator and evaluation
 * implementations.
 *
 * <p>NOTE(review): always returns {@code null} — evaluation is not implemented at the
 * single-layer level.
 *
 * @param iterator data to evaluate on
 * @param evaluations evaluation implementations to populate
 * @return always {@code null}
 */
@Override
public <T extends IEvaluation> T[] doEvaluation(MultiDataSetIterator iterator, T... evaluations) {
  return null;
}
/** Init the model. No-op here; initialization is handled outside this base class. */
@Override
public void init() {}
/**
 * Update layer weights and biases with gradient change.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param gradient the gradient to apply (ignored here)
 */
@Override
public void update(Gradient gradient) {}
/**
 * Perform one update applying the gradient.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param gradient the gradient to apply (ignored here)
 * @param paramType the parameter key the gradient applies to (ignored here)
 */
@Override
public void update(INDArray gradient, String paramType) {}
/**
 * The number of parameters for the model, summed over every array in the param table.
 *
 * @return the number of parameters for the model
 */
@Override
public long numParams() {
  // Accumulate in a long: INDArray.length() is a long and the method returns long, so an
  // int accumulator could silently overflow/truncate for very large layers.
  long ret = 0;
  for (INDArray val : paramTable.values()) {
    ret += val.length();
  }
  return ret;
}
/**
 * The current inputs batch size.
 *
 * <p>NOTE(review): always returns 0 — not tracked in this base class.
 *
 * @return always 0
 */
@Override
public int batchSize() {
  return 0;
}
/**
 * Set the {@link TrainingListener}s for this model. If any listeners have previously been set,
 * they will be replaced by this method.
 *
 * @param listeners listeners to install; delegates to the List overload
 */
@Override
public void addTrainingListeners(TrainingListener... listeners) {
  // List.of is null-hostile: a null element (or null array) will throw NullPointerException.
  addTrainingListeners(List.of(listeners));
}
/**
 * Set the parameters for a given parameter type. An existing entry is updated in place via
 * {@code assign} (preserving any view semantics); an unknown key is inserted as a new entry.
 *
 * @param key the param type key to set
 * @param val the new parameters ndarray
 */
@Override
public void setParam(String key, INDArray val) {
  if (!paramTable.containsKey(key)) {
    // New key: store the array reference directly.
    paramTable.put(key, val);
    return;
  }
  // Known key: copy values into the existing array rather than replacing the reference.
  paramTable.get(key).assign(val);
}
/**
 * Returns the parameter array for the given key, or null if absent.
 *
 * @param param parameter key (e.g. weight/bias key)
 * @return the parameter ndarray, or null if no such key
 */
@Override
public INDArray getParam(String param) {
  return paramTable.get(param);
}
/**
 * Sets all parameters from a flattened vector (in 'f' order). Identity check: setting the
 * exact same array object as the current flattened view is a no-op.
 *
 * @param params flattened parameter vector
 */
@Override
public void setParams(INDArray params) {
  if (params == paramsFlattened) return; // no op
  setParams(params, 'f');
}
/**
 * Returns the flattened 1d parameter vector for this layer (may be null if not yet set via
 * {@link #setParamsViewArray(INDArray)}).
 *
 * @return 1d parameter vector
 */
@Override
public INDArray getParams() {
  return paramsFlattened;
}
/** No-op: this base layer holds no resources that need explicit closing. */
@Override
public void close() {}
/**
 * This method fits model with a given DataSet.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param dataSet data to fit on (ignored here)
 */
@Override
public void fit(DataSet dataSet) {}
/**
 * This method fits model with a given MultiDataSet.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param dataSet data to fit on (ignored here)
 */
@Override
public void fit(MultiDataSet dataSet) {}
/**
 * This method fits model with a given DataSetIterator.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param iterator iterator to fit on (ignored here)
 */
@Override
public void fit(DataSetIterator iterator) {}
/**
 * This method fits model with a given MultiDataSetIterator.
 *
 * <p>NOTE(review): no-op in this base class.
 *
 * @param iterator iterator to fit on (ignored here)
 */
@Override
public void fit(MultiDataSetIterator iterator) {}
/**
 * This method returns updater state (if applicable), null otherwise.
 *
 * @return always {@code null} in this base class
 */
@Override
public INDArray updaterState() {
  return null;
}
/**
 * Backpropagates the error signal through this layer, producing the parameter gradients and the
 * epsilon to pass to the layer below. Notation varies; some texts write the incoming error as
 * \epsilon (see http://web.cs.swarthmore.edu/~meeden/cs81/s10/BackPropDeriv.pdf).
 *
 * @param epsilon w^(L+1)*delta^(L+1). Or, equiv: dC/da, i.e., (dC/dz)*(dz/da) = dC/da, where C is
 *     cost function a=sigma(z) is activation.
 * @param workspaceMgr Workspace manager
 * @return pair of (gradients for this layer's params, epsilon for the previous layer)
 */
@Override
public Pair<Gradient, INDArray> backpropGradient(
    INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
  assertInputSet(true);
  // If this layer is layer L, then epsilon is (w^(L+1)*(d^(L+1))^T) (or equivalent)
  // Recompute the forward pre-activations; the pre-norm variant is needed for LayerNormBp.
  Pair<INDArray, INDArray> zAndPreNorm = preOutputWithPreNorm(true, true, workspaceMgr);
  INDArray z =
      zAndPreNorm.getFirst(); // Note: using preOutput(INDArray) can't be used as this does a
  // setInput(input) and resets the 'appliedDropout' flag
  INDArray preNorm = zAndPreNorm.getSecond();
  // delta = dC/dz: activation-function backprop applied to the incoming epsilon.
  INDArray delta =
      getTypedLayerConfiguration()
          .getActivation()
          .backprop(z, epsilon)
          .getFirst(); // TODO handle activation function params
  if (maskArray != null) {
    applyMask(delta);
  }
  Gradient ret = new DefaultGradient();
  if (hasBias()) {
    // Bias gradient = column sums of delta, written directly into the gradient view.
    INDArray biasGrad = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
    delta.sum(biasGrad, 0); // biasGrad is initialized/zeroed first
    ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGrad);
  }
  INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, true, workspaceMgr);
  INDArray epsilonNext =
      workspaceMgr.createUninitialized(
          ArrayType.ACTIVATION_GRAD,
          delta.dataType(),
          new long[] {W.size(0), delta.size(0)},
          'f');
  if (hasLayerNorm()) {
    // Layer-norm backprop updates delta in place and fills the gain gradient view.
    INDArray g = getParam(DefaultParamInitializer.GAIN_KEY);
    INDArray dldg = gradientViews.get(DefaultParamInitializer.GAIN_KEY);
    Nd4j.getExecutioner().exec(new LayerNormBp(preNorm, g, delta, delta, dldg, true, 1));
    ret.gradientForVariable().put(DefaultParamInitializer.GAIN_KEY, dldg);
  }
  epsilonNext =
      W.mmuli(delta.transpose(), epsilonNext)
          .transpose(); // W.mmul(delta.transpose()).transpose();
  // Weight gradient = input^T * delta, accumulated straight into the 'f'-order gradient view.
  INDArray weightGrad = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY); // f order
  Nd4j.gemm(
      getInput().castTo(weightGrad.dataType()),
      delta,
      weightGrad,
      true,
      false,
      1.0,
      0.0); // TODO avoid castTo?
  ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
  // Post-noise params were cached for this iteration (forward + backprop); discard them now.
  weightNoiseParams.clear();
  epsilonNext = backpropDropOutIfPresent(epsilonNext);
  return new Pair<>(ret, epsilonNext);
}
/**
 * Runs a training-mode forward pass and derives this layer's score from the activations.
 * Does nothing (beyond a warning) when no input has been set.
 *
 * @param workspaceMgr workspace manager for the forward pass
 */
public void computeGradientAndScore(LayerWorkspaceMgr workspaceMgr) {
  INDArray in = getInput();
  if (in == null) {
    log.warn("There is no input for this layer '{}'", layerConfiguration);
    return;
  }
  setScoreWithZ(activate(true, workspaceMgr));
}
// Hook for subclasses to compute 'score' from the forward-pass activations; no-op here.
protected void setScoreWithZ(INDArray z) {}
/**
 * Objective function: the specified objective.
 *
 * @return the score for the objective
 */
@Override
public double getScore() {
  return score;
}
/** Returns the most recently computed gradient (may be null before any backward pass). */
@Override
public Gradient gradient() {
  return gradient;
}
/**
 * Returns the optimizer for this layer, lazily building one on first call.
 *
 * @return the optimizer used to train this layer
 */
@Override
public ConvexOptimizer getOptimizer() {
  if (optimizer == null) {
    // NOTE(review): this local Solver shadows the 'solver' field, which remains null here;
    // fit(INDArray, LayerWorkspaceMgr) builds and stores its own Solver separately. Confirm
    // this asymmetry is intended.
    Solver solver = new Solver.Builder().model(this).configure(getNetConfiguration()).build();
    this.optimizer = solver.getOptimizer();
  }
  return optimizer;
}
/**
 * Returns the parameters of the neural network as a flattened row vector.
 *
 * <p>Same backing array as {@link #getParams()}.
 *
 * @return the parameters of the neural network
 */
@Override
public INDArray getModelParams() {
  return paramsFlattened;
}
/**
 * Sets all parameters from a flattened vector ('f' order). Passing the exact same array object
 * as the current flattened view is a no-op.
 *
 * @param paramsTable flattened parameter vector
 */
public void setParamsTable(INDArray paramsTable) {
  if (paramsTable != paramsFlattened) {
    setParams(paramsTable, 'f');
  }
}
/**
 * Copies values from a flattened vector into each per-key parameter array, in param-table key
 * order. Lengths are validated both in aggregate and per parameter.
 *
 * @param params flattened parameter vector; null is tolerated (logged and skipped)
 * @param order ordering ('c' or 'f') used when reshaping each slice to the parameter's shape
 * @throws IllegalArgumentException if the total length does not match this layer's param count
 * @throws IllegalStateException if an individual slice length mismatches (should not happen if
 *     the total length check passed)
 */
protected void setParams(INDArray params, char order) {
  if (params == null) {
    log.trace(
        "setParams(INDArray params, char order): params is null. Skipping setParams in Layer {}[{}] at index {}",
        getLayerConfiguration().getName(),
        getClass().getSimpleName(),
        getIndex());
    return;
  }
  // Total expected length, computed from the configuration's variable list.
  Set<String> parameterList = layerConfiguration.getVariables(); // netWideVariables();
  int length = 0;
  for (String s : parameterList) {
    length += getParam(s).length();
  }
  if (params.length() != length) {
    throw new IllegalArgumentException(
        "Unable to set parameters: must be of length "
            + length
            + ", got params of length "
            + params.length()
            + " - "
            + layerId());
  }
  // Walk the flattened vector, assigning one contiguous slice per parameter key.
  // NOTE(review): iteration order here comes from paramTable's keySet, while the length check
  // above used layerConfiguration.getVariables() — assumed consistent; verify.
  int idx = 0;
  Set<String> paramKeySet = this.getParamTable().keySet();
  for (String s : paramKeySet) {
    INDArray param = getParam(s);
    INDArray get =
        params.get(NDArrayIndex.point(0), NDArrayIndex.interval(idx, idx + param.length()));
    if (param.length() != get.length()) {
      throw new IllegalStateException(
          "Parameter "
              + s
              + " should have been of length "
              + param.length()
              + " but was "
              + get.length()
              + " - "
              + layerId());
    }
    param.assign(
        get.reshape(
            order,
            param.shape())); // Use assign due to backprop params being a view of a larger array
    idx += param.length();
  }
}
/**
 * Stores the given array as this layer's flattened parameter view. Note: this only stores the
 * reference — it does not copy values into the per-key arrays in paramTable.
 *
 * @param params flattened parameter array; length must equal numParams() when a param table exists
 * @throws IllegalArgumentException on length mismatch
 */
@Override
public void setParamsViewArray(INDArray params) {
  if (this.getParamTable() != null && params.length() != numParams()) {
    throw new IllegalArgumentException(
        "Invalid input: expect params of length "
            + numParams()
            + ", got params of length "
            + params.length()
            + " - "
            + layerId());
  }
  this.paramsFlattened = params;
}
/**
 * Returns the parameter table; the isBackprop flag is ignored in this base class (the same
 * table is returned either way).
 */
@Override
public Map<String, INDArray> getParamTable(boolean isBackprop) {
  return paramTable;
}
/** Returns the flattened gradients array (may be null before setBackpropGradientsViewArray). */
@Override
public INDArray getGradientsViewArray() {
  return gradientsFlattened;
}
/**
 * Stores the given array as this layer's flattened gradient view and rebuilds the per-key
 * gradient views from it.
 *
 * @param gradients flattened gradient array; length must equal numParams() when a param table
 *     exists
 * @throws IllegalArgumentException on length mismatch
 */
@Override
public void setBackpropGradientsViewArray(INDArray gradients) {
  if (this.getParamTable() != null && gradients.length() != numParams()) {
    // Report the same quantity the check above compared against (was numParams(true),
    // which could print a different number than the one that failed the check).
    throw new IllegalArgumentException(
        "Invalid input: expect gradients array of length "
            + numParams()
            + ", got array of length "
            + gradients.length()
            + " - "
            + layerId());
  }
  this.gradientsFlattened = gradients;
  // Per-key gradient views are slices into the flattened array.
  this.gradientViews =
      layerConfiguration.initializer().getGradientsFromFlattened(layerConfiguration, gradients);
}
/**
* Get the parameter, after applying any weight noise (such as DropConnect) if necessary. Note
* that during training, this will store the post-noise parameters, as these should be used for
* both forward pass and backprop, for a single iteration. Consequently, the parameters (post
* noise) should be cleared after each training iteration
*
* @param param Parameter key
* @param training If true: during training
* @return The parameter, after applying any noise
*/
protected INDArray getParamWithNoise(
@NonNull String param, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
INDArray p;
LayerConfiguration lconf = getLayerConfiguration();
if (lconf.getWeightNoise() != null) {
if (training && weightNoiseParams.size() > 0 && weightNoiseParams.containsKey(param)) {
// Re-use these weights for both forward pass and backprop - don't want to use 2 different
// params here
// These should be cleared during backprop
return weightNoiseParams.get(param);
} else {
try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) {
p =
lconf
.getWeightNoise()
.getParameter(
this, param, getIterationCount(), getEpochCount(), training, workspaceMgr);
2019-06-06 15:21:15 +03:00
}
}
if (training) {
// Store for re-use in backprop
weightNoiseParams.put(param, p);
}
} else {
return getParam(param);
}
return p;
}
// Convenience wrapper: forward pre-activations only, discarding the pre-norm array.
protected INDArray preOutput(boolean training, LayerWorkspaceMgr workspaceMgr) {
  return preOutputWithPreNorm(training, false, workspaceMgr).getFirst();
}
/**
 * Computes the linear pre-activations: input*W, then optional layer norm, then optional bias,
 * then optional mask. Also returns the pre-layer-norm array (needed by LayerNormBp during
 * backprop; identical to the output when layer norm is disabled or forBackprop is false).
 *
 * @param training whether this is a training-mode pass (affects dropout and weight noise)
 * @param forBackprop if true, the pre-norm array is dup'd so backprop sees unmodified values
 * @param workspaceMgr workspace manager for array allocation
 * @return pair of (pre-activations, pre-layer-norm activations)
 * @throws DL4JInvalidInputException if the input is not rank 2 or its column count mismatches W
 */
protected Pair<INDArray, INDArray> preOutputWithPreNorm(
    boolean training, boolean forBackprop, @NonNull LayerWorkspaceMgr workspaceMgr) {
  assertInputSet(forBackprop);
  applyDropOutIfNecessary(training, workspaceMgr);
  INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, training, workspaceMgr);
  INDArray b = getParamWithNoise(DefaultParamInitializer.BIAS_KEY, training, workspaceMgr);
  INDArray g = (hasLayerNorm() ? getParam(DefaultParamInitializer.GAIN_KEY) : null);
  INDArray input = getInput().castTo(dataType);
  // Input validation:
  if (input.rank() != 2 || input.columns() != W.rows()) {
    if (input.rank() != 2) {
      throw new DL4JInvalidInputException(
          "Input that is not a matrix; expected matrix (rank 2), got rank "
              + input.rank()
              + " array with shape "
              + Arrays.toString(input.shape())
              + ". Missing preprocessor or wrong input type? "
              + layerId());
    }
    throw new DL4JInvalidInputException(
        "Input size ("
            + input.columns()
            + " columns; shape = "
            + Arrays.toString(input.shape())
            + ") is invalid: does not match layer input size (layer # inputs = "
            + W.size(0)
            + ") "
            + layerId());
  }
  // ret = input * W, computed into an uninitialized workspace array.
  INDArray ret =
      workspaceMgr.createUninitialized(
          ArrayType.ACTIVATIONS, W.dataType(), input.size(0), W.size(1));
  input
      .castTo(ret.dataType())
      .mmuli(
          W, ret); // TODO Can we avoid this cast? (It sohuld be a no op if not required, however)
  INDArray preNorm = ret;
  if (hasLayerNorm()) {
    // For backprop, keep an unmodified copy of the pre-norm values; LayerNorm writes into ret.
    preNorm = (forBackprop ? ret.dup(ret.ordering()) : ret);
    Nd4j.getExecutioner().exec(new LayerNorm(preNorm, g, ret, true, 1));
  }
  if (hasBias()) {
    // NOTE(review): bias is added AFTER layer norm here — assumed intentional; verify.
    ret.addiRowVector(b);
  }
  if (maskArray != null) {
    applyMask(ret);
  }
  return new Pair<>(ret, preNorm);
}
/**
 * Forward pass: computes (input*W) + b (with optional layer norm), applies the activation
 * function, and masks the result if a mask array is set.
 *
 * @param training whether this is a training-mode pass
 * @param workspaceMgr workspace manager for array allocation
 * @return this layer's activations
 */
@Override
public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
  INDArray preAct = preOutput(training, workspaceMgr); // (Input*Weights) + bias
  INDArray activations =
      getTypedLayerConfiguration().getActivationFn().getActivation(preAct, training);
  if (maskArray != null) {
    applyMask(activations);
  }
  return activations;
}
/**
 * Sets the given array as this layer's input, then runs the forward pass.
 *
 * @param input input activations for this layer
 * @param training whether this is a training-mode pass
 * @param workspaceMgr workspace manager for array allocation
 * @return this layer's activations
 */
@Override
public INDArray activate(
    @NonNull INDArray input, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
  setInput(input, workspaceMgr);
  return activate(training, workspaceMgr);
}
/**
 * Sums the regularization penalties (L1/L2/weight decay etc.) over all parameters of this
 * layer, using each parameter's configured regularization list.
 *
 * <p>NOTE(review): the {@code backpropParamsOnly} flag is not consulted in this base class.
 *
 * @param backpropParamsOnly unused here
 * @return total regularization score across all parameters
 */
@Override
public double calcRegularizationScore(boolean backpropParamsOnly) {
  double total = 0.0;
  for (Map.Entry<String, INDArray> entry : getParamTable().entrySet()) {
    List<Regularization> regs =
        getTypedLayerConfiguration().getRegularizationByParam(entry.getKey());
    if (regs != null) {
      for (Regularization reg : regs) {
        total += reg.score(entry.getValue(), getIterationCount(), getEpochCount());
      }
    }
  }
  return total;
}
/**
 * Creates a copy of this layer by invoking its (NeuralNetConfiguration) constructor reflectively
 * and duplicating every parameter array, so the clone shares no parameter state with this layer.
 *
 * <p>Preserves existing best-effort behavior: on any reflection/instantiation failure the error
 * is logged and {@code null} is returned.
 *
 * @return the cloned layer, or null if cloning failed
 */
@Override
public Layer clone() {
  Layer layer = null;
  try {
    // Parameterized Constructor<?> instead of the raw Constructor type.
    Constructor<?> c = getClass().getConstructor(NeuralNetConfiguration.class);
    layer = (Layer) c.newInstance(layerConfiguration);
    // dup() each parameter so the clone is independent of this layer's arrays.
    Map<String, INDArray> linkedTable = new LinkedHashMap<>();
    for (Map.Entry<String, INDArray> entry : getParamTable().entrySet()) {
      linkedTable.put(entry.getKey(), entry.getValue().dup());
    }
    layer.setParamTable(linkedTable);
  } catch (Exception e) {
    log.error("", e);
  }
  return layer;
}
/**
 * Fits this single layer on the given input: sets the input (applying dropout for training),
 * lazily builds a Solver, and runs one optimization.
 *
 * @param input input activations to fit on; if null, the previously-set input is used
 * @param workspaceMgr workspace manager for the optimization pass
 */
@Override
public void fit(INDArray input, LayerWorkspaceMgr workspaceMgr) {
  if (input != null) {
    setInput(input, workspaceMgr);
    applyDropOutIfNecessary(true, workspaceMgr);
  }
  if (solver == null) {
    // Build the solver once and reuse it across fit calls.
    solver =
        new Solver.Builder()
            .model(this)
            .configure(getNetConfiguration())
            .listeners(getTrainingListeners())
            .build();
  }
  this.optimizer = solver.getOptimizer();
  solver.optimize(workspaceMgr);
}
/** Diagnostic string: class name plus configuration, score, optimizer and listeners. */
@Override
public String toString() {
  StringBuilder sb = new StringBuilder(getClass().getName());
  sb.append('{')
      .append("conf=")
      .append(layerConfiguration)
      .append(", score=")
      .append(score)
      .append(", optimizer=")
      .append(optimizer)
      .append(", listeners=")
      .append(trainingListeners)
      .append('}');
  return sb.toString();
}
/** Clears inherited state and discards any cached post-noise parameters. */
@Override
public void clear() {
  super.clear();
  weightNoiseParams.clear();
}
/** Discards cached post-noise parameters only (see getParamWithNoise). */
@Override
public void clearNoiseWeightParams() {
  weightNoiseParams.clear();
}
/**
* Does this layer have no bias term? Many layers (dense, convolutional, output, embedding) have
* biases by default, but no-bias versions are possible via configuration
*
* @return True if a bias term is present, false otherwise
*/
public boolean hasBias() {
// Overridden by layers supporting no bias mode: dense, output, convolutional, embedding
2023-05-08 09:34:44 +02:00
//return true;
return true;
}
/**
 * Does this layer support and is it enabled layer normalization? Only Dense and SimpleRNN Layers
 * support layer normalization.
 *
 * @return True if layer normalization is enabled on this layer, false otherwise
 */
public boolean hasLayerNorm() {
  // Overridden by layers supporting layer normalization.
  return false;
}
}