/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j.gradientcheck;

import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.distribution.UniformDistribution;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.dropout.IDropout;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.layers.convolution.ConvolutionHelper;
import org.deeplearning4j.nn.layers.convolution.CudnnConvolutionHelper;
import org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingHelper;
import org.deeplearning4j.nn.layers.dropout.CudnnDropoutHelper;
import org.deeplearning4j.nn.layers.normalization.BatchNormalizationHelper;
import org.deeplearning4j.nn.layers.normalization.CudnnBatchNormalizationHelper;
import org.deeplearning4j.nn.layers.normalization.CudnnLocalResponseNormalizationHelper;
import org.deeplearning4j.nn.layers.normalization.LocalResponseNormalizationHelper;
import org.deeplearning4j.nn.layers.recurrent.CudnnLSTMHelper;
import org.deeplearning4j.nn.layers.recurrent.LSTMHelper;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.buffer.util.DataTypeUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.function.Consumer;
import org.nd4j.linalg.learning.config.NoOp;
import org.nd4j.linalg.lossfunctions.LossFunctions;

import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

/**
 * Gradient checks for layers backed by CuDNN helpers (convolution, subsampling, batch norm, LRN, LSTM, dropout).
 *
 * Created by Alex on 09/09/2016.
 */
@Slf4j
public class CuDNNGradientChecks extends BaseDL4JTest {

    private static final boolean PRINT_RESULTS = true;
    private static final boolean RETURN_ON_FIRST_FAILURE = false;
    private static final double DEFAULT_EPS = 1e-5;
    private static final double DEFAULT_MAX_REL_ERROR = 1e-2;
    private static final double DEFAULT_MIN_ABS_ERROR = 1e-6;

    static {
        //Gradient checks must run in double precision: float rounding error swamps the numerical gradients
        DataTypeUtil.setDTypeForContext(DataType.DOUBLE);
    }
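
    //All tests below follow the same pattern: build a small network whose layers should pick up a
    //CuDNN helper, verify via reflection that the helper was actually loaded, then compare analytic
    //gradients against numerical ones. For each checked parameter w, GradientCheckUtil perturbs it
    //and uses the central difference (score(w+eps) - score(w-eps)) / (2*eps); the check fails when
    //the relative error exceeds DEFAULT_MAX_REL_ERROR and the absolute error is also above
    //DEFAULT_MIN_ABS_ERROR.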

    @Test
    public void testConvolutional() throws Exception {

        //Parameterized test, testing combinations of:
        // (a) activation function
        // (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
        // (c) Loss function (with specified output activations)
        Activation[] activFns = {Activation.SIGMOID, Activation.TANH};
        boolean[] characteristic = {false, true}; //If true: run some backprop steps first

        int[] minibatchSizes = {1, 4};
        int width = 6;
        int height = 6;
        int inputDepth = 2;
        int nOut = 3;

        //Access the private 'helper' field via reflection, to verify that the CuDNN helper is actually in use
        Field f = org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.class.getDeclaredField("helper");
        f.setAccessible(true);

        Random r = new Random(12345);
        for (Activation afn : activFns) {
            for (boolean doLearningFirst : characteristic) {
                for (int minibatchSize : minibatchSizes) {

                    INDArray input = Nd4j.rand(new int[] {minibatchSize, inputDepth, height, width});
                    INDArray labels = Nd4j.zeros(minibatchSize, nOut);
                    for (int i = 0; i < minibatchSize; i++) {
                        labels.putScalar(i, r.nextInt(nOut), 1.0);
                    }

                    MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
                            .dataType(DataType.DOUBLE)
                            .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT)
                            .dist(new UniformDistribution(-1, 1))
                            .updater(new NoOp()).seed(12345L).list()
                            .layer(0, new ConvolutionLayer.Builder(2, 2).stride(2, 2).padding(1, 1).nOut(3)
                                    .activation(afn).build())
                            .layer(1, new ConvolutionLayer.Builder(2, 2).stride(2, 2).padding(0, 0).nOut(3)
                                    .activation(afn).build())
                            .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .activation(Activation.SOFTMAX).nOut(nOut).build())
                            .setInputType(InputType.convolutional(height, width, inputDepth));

                    MultiLayerConfiguration conf = builder.build();

                    MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                    mln.init();

                    org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c0 =
                            (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) mln.getLayer(0);
                    ConvolutionHelper ch0 = (ConvolutionHelper) f.get(c0);
                    assertTrue(ch0 instanceof CudnnConvolutionHelper);

                    org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c1 =
                            (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) mln.getLayer(1);
                    ConvolutionHelper ch1 = (ConvolutionHelper) f.get(c1);
                    assertTrue(ch1 instanceof CudnnConvolutionHelper);

                    //-------------------------------
                    //For debugging/comparison to no-cudnn case: set helper field to null
                    //                    f.set(c0, null);
                    //                    f.set(c1, null);
                    //                    assertNull(f.get(c0));
                    //                    assertNull(f.get(c1));
                    //-------------------------------

                    String name = new Object() {}.getClass().getEnclosingMethod().getName();

                    if (doLearningFirst) {
                        //Run a number of iterations of learning
                        mln.setInput(input);
                        mln.setLabels(labels);
                        mln.computeGradientAndScore();
                        double scoreBefore = mln.score();
                        for (int j = 0; j < 10; j++)
                            mln.fit(input, labels);
                        mln.computeGradientAndScore();
                        double scoreAfter = mln.score();
                        //Can't test in 'characteristic mode of operation' if not learning
                        String msg = name + " - score did not (sufficiently) decrease during learning - activationFn="
                                + afn + ", doLearningFirst=" + doLearningFirst + " (scoreBefore=" + scoreBefore
                                + ", scoreAfter=" + scoreAfter + ")";
                        assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
                    }

                    if (PRINT_RESULTS) {
                        System.out.println(name + " - activationFn=" + afn + ", doLearningFirst=" + doLearningFirst);
                        for (int j = 0; j < mln.getnLayers(); j++)
                            System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                    }

                    boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                            DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                    assertTrue(gradOK);
                }
            }
        }
    }
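
    //The reflection-based helper check above is repeated in most of the tests in this class. A
    //hypothetical helper (not part of this file) could consolidate it, e.g.:
    //
    //    private static void assertCudnnHelper(Object layer, Class<?> layerClass, Class<?> expectedHelper) throws Exception {
    //        Field f = layerClass.getDeclaredField("helper");
    //        f.setAccessible(true);
    //        assertTrue(expectedHelper.isInstance(f.get(layer)));
    //    }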

    @Test
    public void testConvolutionalNoBias() throws Exception {
        int[] minibatchSizes = {1, 4};
        int width = 6;
        int height = 6;
        int inputDepth = 2;
        int nOut = 3;

        Field f = org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.class.getDeclaredField("helper");
        f.setAccessible(true);

        Random r = new Random(12345);
        for (int minibatchSize : minibatchSizes) {
            for (boolean convHasBias : new boolean[]{true, false}) {

                INDArray input = Nd4j.rand(new int[]{minibatchSize, inputDepth, height, width});
                INDArray labels = Nd4j.zeros(minibatchSize, nOut);
                for (int i = 0; i < minibatchSize; i++) {
                    labels.putScalar(i, r.nextInt(nOut), 1.0);
                }

                MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
                        .dataType(DataType.DOUBLE)
                        .dist(new UniformDistribution(-1, 1))
                        .updater(new NoOp()).seed(12345L)
                        .list()
                        .layer(0, new ConvolutionLayer.Builder(2, 2).stride(2, 2).padding(1, 1).nOut(3)
                                .hasBias(convHasBias)
                                .activation(Activation.TANH).build())
                        .layer(1, new ConvolutionLayer.Builder(2, 2).stride(2, 2).padding(0, 0).nOut(3)
                                .hasBias(convHasBias)
                                .activation(Activation.TANH).build())
                        .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                .activation(Activation.SOFTMAX).nOut(nOut).build())
                        .setInputType(InputType.convolutional(height, width, inputDepth));

                MultiLayerConfiguration conf = builder.build();

                MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                mln.init();

                org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c0 =
                        (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) mln.getLayer(0);
                ConvolutionHelper ch0 = (ConvolutionHelper) f.get(c0);
                assertTrue(ch0 instanceof CudnnConvolutionHelper);

                org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c1 =
                        (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) mln.getLayer(1);
                ConvolutionHelper ch1 = (ConvolutionHelper) f.get(c1);
                assertTrue(ch1 instanceof CudnnConvolutionHelper);

                String name = new Object() {}.getClass().getEnclosingMethod().getName() + ", minibatch = "
                        + minibatchSize + ", convHasBias = " + convHasBias;

                if (PRINT_RESULTS) {
                    System.out.println(name);
                    for (int j = 0; j < mln.getnLayers(); j++)
                        System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                }

                boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                assertTrue(name, gradOK);
            }
        }
    }
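
    //With hasBias(false), the convolution layers have no bias parameter array at all, so the check
    //covers only the weight gradients for those layers; the per-layer parameter counts printed above
    //differ accordingly (kH*kW*nIn*nOut without bias, plus nOut with bias).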

    @Test
    public void testBatchNormCnn() throws Exception {
        //Note: CuDNN batch norm supports 4d input only, as of cuDNN 5.1 (according to the API reference documentation)
        Nd4j.getRandom().setSeed(12345);
        int minibatch = 10;
        int depth = 1;
        int hw = 4;
        int nOut = 4;
        INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
        INDArray labels = Nd4j.zeros(minibatch, nOut);
        Random r = new Random(12345);
        for (int i = 0; i < minibatch; i++) {
            labels.putScalar(i, r.nextInt(nOut), 1.0);
        }

        MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().updater(new NoOp())
                .dataType(DataType.DOUBLE)
                .seed(12345L)
                .dist(new NormalDistribution(0, 2)).list()
                .layer(0, new ConvolutionLayer.Builder().kernelSize(2, 2).stride(1, 1).nIn(depth).nOut(2)
                        .activation(Activation.IDENTITY).build())
                .layer(1, new BatchNormalization.Builder().build())
                .layer(2, new ActivationLayer.Builder().activation(Activation.TANH).build())
                .layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nOut(nOut).build())
                .setInputType(InputType.convolutional(hw, hw, depth));

        MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
        mln.init();

        Field f = org.deeplearning4j.nn.layers.normalization.BatchNormalization.class.getDeclaredField("helper");
        f.setAccessible(true);

        org.deeplearning4j.nn.layers.normalization.BatchNormalization b =
                (org.deeplearning4j.nn.layers.normalization.BatchNormalization) mln.getLayer(1);
        BatchNormalizationHelper bn = (BatchNormalizationHelper) f.get(b);
        assertTrue(bn instanceof CudnnBatchNormalizationHelper);

        //-------------------------------
        //For debugging/comparison to no-cudnn case: set helper field to null
        //        f.set(b, null);
        //        assertNull(f.get(b));
        //-------------------------------

        if (PRINT_RESULTS) {
            for (int j = 0; j < mln.getnLayers(); j++)
                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
        }

        //Mean and variance vars are not gradient checkable; the mean/variance "gradient" is used to implement
        //the running mean/variance calculation, i.e., runningMean = decay * runningMean + (1-decay) * batchMean.
        //The numerical gradient would be 0, as the forward pass doesn't depend on this "parameter".
        Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "1_log10stdev"));
        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, excludeParams);

        assertTrue(gradOK);
    }
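
    //For reference: batch norm computes y = gamma * (x - batchMean) / sqrt(batchVar + eps) + beta,
    //so gamma and beta are the only parameters trained by backprop; the running statistics excluded
    //above are updated outside of the gradient, as described in the comment before the check.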

    @Test
    public void testLRN() throws Exception {

        Nd4j.getRandom().setSeed(12345);
        int minibatch = 10;
        int depth = 6;
        int hw = 5;
        int nOut = 4;
        INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
        INDArray labels = Nd4j.zeros(minibatch, nOut);
        Random r = new Random(12345);
        for (int i = 0; i < minibatch; i++) {
            labels.putScalar(i, r.nextInt(nOut), 1.0);
        }

        MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().updater(new NoOp())
                .dataType(DataType.DOUBLE)
                .seed(12345L)
                .dist(new NormalDistribution(0, 2)).list()
                .layer(0, new ConvolutionLayer.Builder().nOut(6).kernelSize(2, 2).stride(1, 1)
                        .activation(Activation.TANH).build())
                .layer(1, new LocalResponseNormalization.Builder().build())
                .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nOut(nOut).build())
                .setInputType(InputType.convolutional(hw, hw, depth));

        MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
        mln.init();

        Field f = org.deeplearning4j.nn.layers.normalization.LocalResponseNormalization.class
                .getDeclaredField("helper");
        f.setAccessible(true);

        org.deeplearning4j.nn.layers.normalization.LocalResponseNormalization l =
                (org.deeplearning4j.nn.layers.normalization.LocalResponseNormalization) mln.getLayer(1);
        LocalResponseNormalizationHelper lrn = (LocalResponseNormalizationHelper) f.get(l);
        assertTrue(lrn instanceof CudnnLocalResponseNormalizationHelper);

        //-------------------------------
        //For debugging/comparison to no-cudnn case: set helper field to null
        //        f.set(l, null);
        //        assertNull(f.get(l));
        //-------------------------------

        if (PRINT_RESULTS) {
            for (int j = 0; j < mln.getnLayers(); j++)
                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
        }

        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

        assertTrue(gradOK);
    }
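
    //LRN itself has no trainable parameters - the check exercises the gradients flowing through it.
    //Across channels it is commonly defined as b_c = a_c / (k + alpha * sum(a_j^2))^beta, with the
    //sum taken over the n channels neighbouring c (constant handling varies between libraries).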

    @Test
    public void testLSTM() throws Exception {

        Nd4j.getRandom().setSeed(12345);
        int minibatch = 4;
        int inputSize = 3;
        int lstmLayerSize = 4;
        int timeSeriesLength = 3;
        int nOut = 4;
        INDArray input = Nd4j.rand(new int[] {minibatch, inputSize, timeSeriesLength});
        INDArray labels = Nd4j.zeros(minibatch, nOut, timeSeriesLength);
        Random r = new Random(12345);
        for (int i = 0; i < minibatch; i++) {
            for (int j = 0; j < timeSeriesLength; j++) {
                labels.putScalar(i, r.nextInt(nOut), j, 1.0);
            }
        }

        MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
                .dataType(DataType.DOUBLE)
                .updater(new NoOp()).seed(12345L)
                .dist(new NormalDistribution(0, 2)).list()
                .layer(0, new LSTM.Builder().nIn(input.size(1)).nOut(lstmLayerSize)
                        .gateActivationFunction(Activation.SIGMOID).activation(Activation.TANH).build())
                .layer(1, new LSTM.Builder().nIn(lstmLayerSize).nOut(lstmLayerSize)
                        .gateActivationFunction(Activation.SIGMOID).activation(Activation.TANH).build())
                .layer(2, new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nIn(lstmLayerSize).nOut(nOut).build());

        MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
        mln.init();

        Field f = org.deeplearning4j.nn.layers.recurrent.LSTM.class.getDeclaredField("helper");
        f.setAccessible(true);

        org.deeplearning4j.nn.layers.recurrent.LSTM l = (org.deeplearning4j.nn.layers.recurrent.LSTM) mln.getLayer(1);
        LSTMHelper helper = (LSTMHelper) f.get(l);
        assertTrue(helper instanceof CudnnLSTMHelper);

        //-------------------------------
        //For debugging/comparison to no-cudnn case: set helper field to null
        //        f.set(l, null);
        //        assertNull(f.get(l));
        //-------------------------------

        if (PRINT_RESULTS) {
            for (int j = 0; j < mln.getnLayers(); j++)
                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
        }

        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 32);

        assertTrue(gradOK);
    }
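
    //The trailing arguments (true, 32) request a subset gradient check: at most 32 entries per
    //parameter array are perturbed numerically, a large speedup over checking every weight while
    //still exercising the CuDNN LSTM forward and backward passes.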

    @Test
    public void testLSTM2() throws Exception {

        Nd4j.getRandom().setSeed(12345);
        int minibatch = 10;
        int inputSize = 3;
        int lstmLayerSize = 4;
        int timeSeriesLength = 3;
        int nOut = 2;
        INDArray input = Nd4j.rand(new int[] {minibatch, inputSize, timeSeriesLength});
        INDArray labels = Nd4j.zeros(minibatch, nOut, timeSeriesLength);
        Random r = new Random(12345);
        for (int i = 0; i < minibatch; i++) {
            for (int j = 0; j < timeSeriesLength; j++) {
                labels.putScalar(i, r.nextInt(nOut), j, 1.0);
            }
        }

        MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
                .dataType(DataType.DOUBLE)
                .updater(new NoOp()).seed(12345L)
                .dist(new NormalDistribution(0, 2)).list()
                .layer(0, new LSTM.Builder().nIn(input.size(1)).nOut(lstmLayerSize)
                        .gateActivationFunction(Activation.SIGMOID).activation(Activation.TANH).build())
                .layer(1, new LSTM.Builder().nIn(lstmLayerSize).nOut(lstmLayerSize)
                        .gateActivationFunction(Activation.SIGMOID).activation(Activation.TANH).build())
                .layer(2, new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nIn(lstmLayerSize).nOut(nOut).build());

        MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
        mln.init();

        Field f = org.deeplearning4j.nn.layers.recurrent.LSTM.class.getDeclaredField("helper");
        f.setAccessible(true);

        org.deeplearning4j.nn.layers.recurrent.LSTM l = (org.deeplearning4j.nn.layers.recurrent.LSTM) mln.getLayer(1);
        LSTMHelper helper = (LSTMHelper) f.get(l);
        assertTrue(helper instanceof CudnnLSTMHelper);

        //-------------------------------
        //For debugging/comparison to no-cudnn case: set helper field to null
        //        f.set(l, null);
        //        assertNull(f.get(l));
        //-------------------------------

        if (PRINT_RESULTS) {
            for (int j = 0; j < mln.getnLayers(); j++)
                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
        }

        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

        assertTrue(gradOK);
    }
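
    //Same structure as testLSTM, but with a larger minibatch, nOut=2, and a full (non-subset)
    //gradient check, so every parameter along the CuDNN LSTM path is perturbed.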

    @Test
    public void testCnnDilated() throws Exception {
        int nOut = 2;

        int minibatchSize = 3;
        int width = 8;
        int height = 8;
        int inputDepth = 3;

        Nd4j.getRandom().setSeed(12345);

        Field f = org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.class.getDeclaredField("helper");
        f.setAccessible(true);

        Field f2 = org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer.class.getDeclaredField("helper");
        f2.setAccessible(true);

        int[] kernelSizes = new int[]{2, 3, 2};
        int[] strides = {1, 2, 2};
        int[] dilation = {2, 3, 2};
        ConvolutionMode[] cModes = new ConvolutionMode[]{ConvolutionMode.Truncate, ConvolutionMode.Same, ConvolutionMode.Truncate};

        for (boolean subsampling : new boolean[]{false, true}) {
            for (int t = 0; t < kernelSizes.length; t++) {
                int k = kernelSizes[t];
                int s = strides[t];
                int d = dilation[t];
                ConvolutionMode cm = cModes[t];

                //Use larger input with larger dilation values (to avoid invalid configurations)
                int w = d * width;
                int h = d * height;

                INDArray input = Nd4j.rand(minibatchSize, w * h * inputDepth);
                INDArray labels = Nd4j.zeros(minibatchSize, nOut);
                for (int i = 0; i < minibatchSize; i++) {
                    labels.putScalar(new int[]{i, i % nOut}, 1.0);
                }

                NeuralNetConfiguration.ListBuilder b = new NeuralNetConfiguration.Builder().seed(12345)
                        .dataType(DataType.DOUBLE)
                        .updater(new NoOp())
                        .activation(Activation.TANH).convolutionMode(cm).list()
                        .layer(new ConvolutionLayer.Builder().name("layer 0")
                                .kernelSize(k, k)
                                .stride(s, s)
                                .dilation(d, d)
                                .nIn(inputDepth).nOut(2).build());
                if (subsampling) {
                    b.layer(new SubsamplingLayer.Builder()
                            .poolingType(SubsamplingLayer.PoolingType.MAX)
                            .kernelSize(k, k)
                            .stride(s, s)
                            .dilation(d, d)
                            .build());
                } else {
                    b.layer(new ConvolutionLayer.Builder().nIn(2).nOut(2)
                            .kernelSize(k, k)
                            .stride(s, s)
                            .dilation(d, d)
                            .build());
                }

                MultiLayerConfiguration conf = b.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nOut(nOut).build())
                        .setInputType(InputType.convolutionalFlat(h, w, inputDepth)).build();

                MultiLayerNetwork net = new MultiLayerNetwork(conf);
                net.init();

                org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c0 =
                        (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) net.getLayer(0);
                ConvolutionHelper ch0 = (ConvolutionHelper) f.get(c0);
                assertTrue(ch0 instanceof CudnnConvolutionHelper);

                if (subsampling) {
                    org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer s1 =
                            (org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer) net.getLayer(1);
                    SubsamplingHelper sh1 = (SubsamplingHelper) f2.get(s1);
                    //sh1 is declared as SubsamplingHelper, so 'instanceof SubsamplingHelper' only tested
                    //for non-null; assert that directly
                    assertNotNull(sh1);
                } else {
                    org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c1 =
                            (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) net.getLayer(1);
                    ConvolutionHelper ch1 = (ConvolutionHelper) f.get(c1);
                    assertTrue(ch1 instanceof CudnnConvolutionHelper);
                }

                for (int i = 0; i < net.getLayers().length; i++) {
                    System.out.println("nParams, layer " + i + ": " + net.getLayer(i).numParams());
                }

                String msg = (subsampling ? "subsampling" : "conv") + " - mb=" + minibatchSize + ", k="
                        + k + ", s=" + s + ", d=" + d + ", cm=" + cm;
                System.out.println(msg);

                boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                assertTrue(msg, gradOK);
            }
        }
    }
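
    //For a dilated convolution the effective kernel extent is k + (k-1)*(d-1), so in Truncate mode
    //the output size is floor((in - (k + (k-1)*(d-1))) / s) + 1. Scaling the input by d above keeps
    //these dimensions valid for every (k, s, d) combination tested.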

    @Test
    public void testDropout() {
        int minibatch = 2;

        for (boolean cnn : new boolean[]{false, true}) {
            Nd4j.getRandom().setSeed(12345);
            IDropout dropout = new Dropout(0.6);

            NeuralNetConfiguration.ListBuilder builder = new NeuralNetConfiguration.Builder()
                    .seed(12345)
                    .dataType(DataType.DOUBLE)
                    .dist(new NormalDistribution(0, 1))
                    .convolutionMode(ConvolutionMode.Same)
                    .dropOut(dropout)
                    .activation(Activation.TANH)
                    .updater(new NoOp())
                    .list();

            if (cnn) {
                builder.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(2, 2).nOut(2).build());
                builder.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(2, 2).nOut(2).build());
                builder.setInputType(InputType.convolutional(8, 8, 2));
            } else {
                builder.layer(new DenseLayer.Builder().nOut(8).build());
                builder.layer(new DenseLayer.Builder().nOut(8).build());
                builder.setInputType(InputType.feedForward(6));
            }
            builder.layer(new OutputLayer.Builder().nOut(3).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build());
            MultiLayerConfiguration conf = builder.build();

            MultiLayerNetwork mln = new MultiLayerNetwork(conf);
            mln.init();

            INDArray f;
            if (cnn) {
                f = Nd4j.rand(new int[]{minibatch, 2, 8, 8}).muli(10).subi(5);
            } else {
                f = Nd4j.rand(minibatch, 6).muli(10).subi(5);
            }
            INDArray l = TestUtils.randomOneHot(minibatch, 3);

            //One forward pass in training mode, so that the dropout helpers are created
            mln.output(f, true);

            for (Layer layer : mln.getLayers()) {
                Dropout d = (Dropout) layer.conf().getLayer().getIDropout();
                assertNotNull(d);
                CudnnDropoutHelper h = (CudnnDropoutHelper) d.getHelper();
                assertNotNull(h);
            }

            String msg = (cnn ? "CNN" : "Dense") + ": " + dropout.getClass().getSimpleName();

            //Consumer function to enforce CuDNN RNG repeatability - otherwise the check will fail due to
            // randomness (inconsistent dropout masks between forward passes)
            Consumer<MultiLayerNetwork> c = new Consumer<MultiLayerNetwork>() {
                @Override
                public void accept(MultiLayerNetwork net) {
                    Nd4j.getRandom().setSeed(12345);
                    for (Layer l : net.getLayers()) {
                        Dropout d = (Dropout) l.conf().getLayer().getIDropout();
                        if (d != null) {
                            ((CudnnDropoutHelper) d.getHelper()).setMask(null);
                            ((CudnnDropoutHelper) d.getHelper()).setRngStates(null);
                        }
                    }
                }
            };

            log.info("*** Starting test: " + msg + " ***");
            boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                    DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l, null, null,
                    false, -1, null, c);

            assertTrue(msg, gradOK);
            TestUtils.testModelSerialization(mln);
        }
    }
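
    //org.nd4j.linalg.function.Consumer is a single-method interface, so the RNG-reset callback above
    //could equivalently be written as a lambda. A sketch with identical behaviour (the loop variable
    //is renamed to 'layer', since a lambda cannot shadow the enclosing method's local 'l'):
    //
    //    Consumer<MultiLayerNetwork> c = net -> {
    //        Nd4j.getRandom().setSeed(12345);
    //        for (Layer layer : net.getLayers()) {
    //            Dropout d = (Dropout) layer.conf().getLayer().getIDropout();
    //            if (d != null) {
    //                ((CudnnDropoutHelper) d.getHelper()).setMask(null);
    //                ((CudnnDropoutHelper) d.getHelper()).setRngStates(null);
    //            }
    //        }
    //    };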

    @Test
    public void testDenseBatchNorm() {

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .dataType(DataType.DOUBLE)
                .seed(12345)
                .weightInit(WeightInit.XAVIER)
                .updater(new NoOp())
                .list()
                .layer(new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build())
                .layer(new BatchNormalization.Builder().nOut(5).build())
                .layer(new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build())
                .build();

        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();

        INDArray in = Nd4j.rand(3, 5);
        INDArray labels = TestUtils.randomOneHot(3, 5);

        //Mean and variance vars are not gradient checkable; the mean/variance "gradient" is used to implement
        //the running mean/variance calculation, i.e., runningMean = decay * runningMean + (1-decay) * batchMean.
        //The numerical gradient would be 0, as the forward pass doesn't depend on this "parameter".
        Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "1_log10stdev"));
        boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, excludeParams);

        assertTrue(gradOK);

        TestUtils.testModelSerialization(net);
    }
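
    //Unlike testBatchNormCnn, this exercises the 2d (feed-forward) batch normalization path; the
    //running mean/variance parameters are excluded for the same reason as in the CNN case.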
}