/*
 * ******************************************************************************
 * *
 * *
 * * This program and the accompanying materials are made available under the
 * * terms of the Apache License, Version 2.0 which is available at
 * * https://www.apache.org/licenses/LICENSE-2.0.
 * *
 * * See the NOTICE file distributed with this work for additional
 * * information regarding copyright ownership.
 * * Unless required by applicable law or agreed to in writing, software
 * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * * License for the specific language governing permissions and limitations
 * * under the License.
 * *
 * * SPDX-License-Identifier: Apache-2.0
 * *****************************************************************************
 */

package org.deeplearning4j.nn.updater;

import lombok.val;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.Updater;
import org.deeplearning4j.nn.conf.GradientNormalization;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.params.DefaultParamInitializer;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.shape.Shape;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.learning.config.NoOp;

import static org.junit.Assert.*;
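
/**
 * Tests for the updater-applied {@link GradientNormalization} strategies: per-layer and
 * per-parameter-type L2 renormalization, element-wise absolute value clipping, and
 * per-layer / per-parameter-type L2 norm clipping.
 * <p>
 * Illustrative sketch (mirroring the configurations used in these tests) of how a layer
 * opts in to gradient normalization:
 * <pre>{@code
 * new NeuralNetConfiguration.Builder()
 *         .layer(new DenseLayer.Builder().nIn(10).nOut(20)
 *                 .gradientNormalization(GradientNormalization.ClipL2PerLayer)
 *                 .gradientNormalizationThreshold(3.0)
 *                 .build())
 *         .build();
 * }</pre>
 */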
public class TestGradientNormalization extends BaseDL4JTest {
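
    //RenormalizeL2PerLayer: all gradients for the layer are divided by a single shared value:
    //the L2 norm computed over every gradient in the layer (weights and biases together)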
    @Test
    public void testRenormalizationPerLayer() {
        Nd4j.getRandom().setSeed(12345);

        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                .layer(new DenseLayer.Builder().nIn(10).nOut(20)
                        .updater(new NoOp())
                        .gradientNormalization(GradientNormalization.RenormalizeL2PerLayer).build())
                .build();

        long numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true, params.dataType());
        INDArray gradArray = Nd4j.rand(1, 220).muli(10).subi(5);
        layer.setBackpropGradientsViewArray(gradArray);
        INDArray weightGrad = Shape.newShapeNoCopy(gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, 200)),
                new int[] {10, 20}, true);
        INDArray biasGrad = gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(200, 220));
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient(gradArray);
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);

        Updater updater = UpdaterCreator.getUpdater(layer);
        updater.update(layer, gradient, 0, 0, 1, LayerWorkspaceMgr.noWorkspaces());

        assertNotEquals(weightGradCopy, weightGrad);
        assertNotEquals(biasGradCopy, biasGrad);
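
        //Expected values: the pre-update copies divided by the layer-wide L2 norm,
        //i.e. sqrt(sum of squares over BOTH the weight and bias gradients)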
        double sumSquaresWeight = weightGradCopy.mul(weightGradCopy).sumNumber().doubleValue();
        double sumSquaresBias = biasGradCopy.mul(biasGradCopy).sumNumber().doubleValue();
        double sumSquares = sumSquaresWeight + sumSquaresBias;
        double l2Layer = Math.sqrt(sumSquares);

        INDArray normWeightsExpected = weightGradCopy.div(l2Layer);
        INDArray normBiasExpected = biasGradCopy.div(l2Layer);

        double l2Weight = gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY).norm2Number().doubleValue();
        double l2Bias = gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY).norm2Number().doubleValue();
        assertTrue(!Double.isNaN(l2Weight) && l2Weight > 0.0);
        assertTrue(!Double.isNaN(l2Bias) && l2Bias > 0.0);
        assertEquals(normWeightsExpected, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(normBiasExpected, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
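
    //RenormalizeL2PerParamType: each parameter type is divided by its own L2 norm,
    //so weights and biases are rescaled independently of each other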
    @Test
    public void testRenormalizationPerParamType() {
        Nd4j.getRandom().setSeed(12345);

        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                .layer(new DenseLayer.Builder().nIn(10).nOut(20)
                        .updater(new NoOp())
                        .gradientNormalization(GradientNormalization.RenormalizeL2PerParamType).build())
                .build();

        long numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true, params.dataType());
        layer.setBackpropGradientsViewArray(Nd4j.create(params.shape()));
        Updater updater = UpdaterCreator.getUpdater(layer);
        INDArray weightGrad = Nd4j.rand(10, 20);
        INDArray biasGrad = Nd4j.rand(1, 20);
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);

        updater.update(layer, gradient, 0, 0, 1, LayerWorkspaceMgr.noWorkspaces());

        INDArray normWeightsExpected = weightGradCopy.div(weightGradCopy.norm2Number());
        INDArray normBiasExpected = biasGradCopy.div(biasGradCopy.norm2Number());

        assertEquals(normWeightsExpected, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(normBiasExpected, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
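
    //ClipElementWiseAbsoluteValue: each gradient element is independently clipped to [-threshold, threshold]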
    @Test
    public void testAbsValueClippingPerElement() {
        Nd4j.getRandom().setSeed(12345);
        double threshold = 3;

        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(
                new DenseLayer.Builder().nIn(10).nOut(20).updater(new NoOp())
                        .gradientNormalization(GradientNormalization.ClipElementWiseAbsoluteValue)
                        .gradientNormalizationThreshold(threshold).build())
                .build();

        long numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true, params.dataType());
        INDArray gradArray = Nd4j.rand(1, 220).muli(10).subi(5);
        layer.setBackpropGradientsViewArray(gradArray);
        INDArray weightGrad = Shape.newShapeNoCopy(gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, 200)),
                new int[] {10, 20}, true);
        INDArray biasGrad = gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(200, 220));
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient(gradArray);
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);

        Updater updater = UpdaterCreator.getUpdater(layer);
        updater.update(layer, gradient, 0, 0, 1, LayerWorkspaceMgr.noWorkspaces());

        assertNotEquals(weightGradCopy, weightGrad);
        assertNotEquals(biasGradCopy, biasGrad);
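
        //Build the expected arrays by clamping each element of the pre-update copies to [-threshold, threshold]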
        INDArray expectedWeightGrad = weightGradCopy.dup();
        for (int i = 0; i < expectedWeightGrad.length(); i++) {
            double d = expectedWeightGrad.getDouble(i);
            if (d > threshold)
                expectedWeightGrad.putScalar(i, threshold);
            else if (d < -threshold)
                expectedWeightGrad.putScalar(i, -threshold);
        }
        INDArray expectedBiasGrad = biasGradCopy.dup();
        for (int i = 0; i < expectedBiasGrad.length(); i++) {
            double d = expectedBiasGrad.getDouble(i);
            if (d > threshold)
                expectedBiasGrad.putScalar(i, threshold);
            else if (d < -threshold)
                expectedBiasGrad.putScalar(i, -threshold);
        }

        assertEquals(expectedWeightGrad, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
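
    //ClipL2PerLayer: if the L2 norm over ALL of the layer's gradients exceeds the threshold,
    //every gradient is scaled by (threshold / norm2); below the threshold, gradients are untouched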
    @Test
    public void testL2ClippingPerLayer() {
        Nd4j.getRandom().setSeed(12345);
        double threshold = 3;

        for (int t = 0; t < 2; t++) {
            //t=0: small -> no clipping
            //t=1: large -> clipping

            NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(
                    new DenseLayer.Builder().nIn(10).nOut(20).updater(new NoOp())
                            .gradientNormalization(GradientNormalization.ClipL2PerLayer)
                            .gradientNormalizationThreshold(threshold).build())
                    .build();

            val numParams = conf.getLayer().initializer().numParams(conf);
            INDArray params = Nd4j.create(1, numParams);
            Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true, params.dataType());
            INDArray gradArray = Nd4j.rand(1, 220).muli(t == 0 ? 0.05 : 10).subi(t == 0 ? 0 : 5);
            layer.setBackpropGradientsViewArray(gradArray);
            INDArray weightGrad =
                    Shape.newShapeNoCopy(gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, 200)),
                            new int[] {10, 20}, true);
            INDArray biasGrad = gradArray.get(NDArrayIndex.point(0), NDArrayIndex.interval(200, 220));
            INDArray weightGradCopy = weightGrad.dup();
            INDArray biasGradCopy = biasGrad.dup();
            Gradient gradient = new DefaultGradient(gradArray);
            gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
            gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);

            double layerGradL2 = gradient.gradient().norm2Number().doubleValue();
            if (t == 0)
                assertTrue(layerGradL2 < threshold);
            else
                assertTrue(layerGradL2 > threshold);

            Updater updater = UpdaterCreator.getUpdater(layer);
            updater.update(layer, gradient, 0, 0, 1, LayerWorkspaceMgr.noWorkspaces());

            if (t == 0) {
                //norm2 < threshold -> no change
                assertEquals(weightGradCopy, weightGrad);
                assertEquals(biasGradCopy, biasGrad);
                continue;
            } else {
                //norm2 > threshold -> rescale
                assertNotEquals(weightGradCopy, weightGrad);
                assertNotEquals(biasGradCopy, biasGrad);
            }

            //for above threshold only...
            double scalingFactor = threshold / layerGradL2;
            INDArray expectedWeightGrad = weightGradCopy.mul(scalingFactor);
            INDArray expectedBiasGrad = biasGradCopy.mul(scalingFactor);
            assertEquals(expectedWeightGrad, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
            assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
        }
    }
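
    //ClipL2PerParamType: the norm check and rescaling of ClipL2PerLayer, but applied separately to each
    //parameter type: here the small weight gradient is unchanged while the large bias gradient is rescaled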
    @Test
    public void testL2ClippingPerParamType() {
        Nd4j.getRandom().setSeed(12345);
        double threshold = 3;

        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(
                new DenseLayer.Builder().nIn(10).nOut(20).updater(new NoOp())
                        .gradientNormalization(GradientNormalization.ClipL2PerParamType)
                        .gradientNormalizationThreshold(threshold).build())
                .build();

        val numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true, params.dataType());
        layer.setBackpropGradientsViewArray(Nd4j.create(params.shape()));
        Updater updater = UpdaterCreator.getUpdater(layer);
        INDArray weightGrad = Nd4j.rand(10, 20).muli(0.05);
        INDArray biasGrad = Nd4j.rand(1, 20).muli(10);
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);

        double weightL2 = weightGrad.norm2Number().doubleValue();
        double biasL2 = biasGrad.norm2Number().doubleValue();
        assertTrue(weightL2 < threshold);
        assertTrue(biasL2 > threshold);

        updater.update(layer, gradient, 0, 0, 1, LayerWorkspaceMgr.noWorkspaces());

        assertEquals(weightGradCopy, weightGrad); //weight norm2 < threshold -> no change
        assertNotEquals(biasGradCopy, biasGrad); //bias norm2 > threshold -> rescale

        double biasScalingFactor = threshold / biasL2;
        INDArray expectedBiasGrad = biasGradCopy.mul(biasScalingFactor);
        assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
}