First round of runtime test improvements (#7875)

* Capsnet test runtime improvements

* Slow test speedups

* Next round of test speed improvements

* More test improvements

* Improve test speed

* Next round of test speedups

* Another round

* More test speedups

* Another round

* Another round of test speedups

* Another round of speedups...

* CuDNN test speedups + more tests extending BaseDL4JTest

* Minor fix + more BaseDL4JTest use in other modules
Branch: master
Author: Alex Black, 2019-06-13 20:40:40 +10:00 (committed by GitHub)
Parent: b5f0ec072f
Commit: 32e5cc1945
188 changed files with 2558 additions and 1531 deletions
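A change repeated across the test files below replaces the hand-rolled one-hot label loop (labels.putScalar(i, r.nextInt(nOut), 1.0) per row) with a single TestUtils.randomOneHot(mb, nOut) call. As a rough sketch of what such a helper does, using only the ND4J calls already visible in these diffs rather than the repository's actual TestUtils code:

import java.util.Random;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class OneHotSketch {

    // Hypothetical stand-in for TestUtils.randomOneHot: one random class per row.
    public static INDArray randomOneHot(int rows, int nOut, long seed) {
        Random r = new Random(seed);
        INDArray labels = Nd4j.zeros(rows, nOut);         // [rows, nOut] array of zeros
        for (int i = 0; i < rows; i++) {
            labels.putScalar(i, r.nextInt(nOut), 1.0);    // set one random column to 1.0 per row
        }
        return labels;
    }

    public static void main(String[] args) {
        System.out.println(randomOneHot(3, 2, 12345));    // e.g. 3 examples, 2 classes
    }
}

Using one helper keeps the label construction out of every test loop; the real TestUtils method may additionally handle data types and seeding differently.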

File: AttentionLayerTest.java

@ -17,12 +17,14 @@
package org.deeplearning4j.gradientcheck;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.graph.AttentionVertex;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
@ -44,7 +46,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Rule
public ExpectedException exceptionRule = ExpectedException.none();
private static final boolean PRINT_RESULTS = false;
private static final boolean PRINT_RESULTS = true;
private static final boolean RETURN_ON_FIRST_FAILURE = false;
private static final double DEFAULT_EPS = 1e-6;
private static final double DEFAULT_MAX_REL_ERROR = 1e-3;
@ -53,19 +55,15 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testSelfAttentionLayer() {
int nIn = 3;
int nOut = 5;
int nOut = 2;
int tsLength = 4;
int layerSize = 8;
int layerSize = 4;
Random r = new Random(12345);
for (int mb : new int[]{1, 2, 3}) {
for (int mb : new int[]{1, 3}) {
for (boolean inputMask : new boolean[]{false, true}) {
for (boolean projectInput : new boolean[]{false, true}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
INDArray in = Nd4j.rand(DataType.DOUBLE, new int[]{mb, nIn, tsLength});
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
@ -94,7 +92,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
.list()
.layer(new LSTM.Builder().nOut(layerSize).build())
.layer( projectInput ?
new SelfAttentionLayer.Builder().nOut(8).nHeads(2).projectInput(true).build()
new SelfAttentionLayer.Builder().nOut(4).nHeads(2).projectInput(true).build()
: new SelfAttentionLayer.Builder().nHeads(1).projectInput(false).build()
)
.layer(new GlobalPoolingLayer.Builder().poolingType(PoolingType.MAX).build())
@ -107,7 +105,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
net.init();
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, true, 100);
assertTrue(name, gradOK);
}
}
@ -117,20 +115,16 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testLearnedSelfAttentionLayer() {
int nIn = 3;
int nOut = 5;
int nOut = 2;
int tsLength = 4;
int layerSize = 8;
int numQueries = 6;
int layerSize = 4;
int numQueries = 3;
Random r = new Random(12345);
for (boolean inputMask : new boolean[]{false, true}) {
for (int mb : new int[]{3, 2, 1}) {
for (int mb : new int[]{3, 1}) {
for (boolean projectInput : new boolean[]{false, true}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
INDArray in = Nd4j.rand(DataType.DOUBLE, new int[]{mb, nIn, tsLength});
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
@ -159,7 +153,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
.list()
.layer(new LSTM.Builder().nOut(layerSize).build())
.layer( projectInput ?
new LearnedSelfAttentionLayer.Builder().nOut(8).nHeads(2).nQueries(numQueries).projectInput(true).build()
new LearnedSelfAttentionLayer.Builder().nOut(4).nHeads(2).nQueries(numQueries).projectInput(true).build()
: new LearnedSelfAttentionLayer.Builder().nHeads(1).nQueries(numQueries).projectInput(false).build()
)
.layer(new GlobalPoolingLayer.Builder().poolingType(PoolingType.MAX).build())
@ -172,7 +166,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
net.init();
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, true, 100);
assertTrue(name, gradOK);
}
}
@ -182,10 +176,10 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testLearnedSelfAttentionLayer_differentMiniBatchSizes() {
int nIn = 3;
int nOut = 5;
int nOut = 2;
int tsLength = 4;
int layerSize = 8;
int numQueries = 6;
int layerSize = 4;
int numQueries = 3;
Random r = new Random(12345);
for (boolean inputMask : new boolean[]{false, true}) {
@ -199,7 +193,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
.list()
.layer(new LSTM.Builder().nOut(layerSize).build())
.layer( projectInput ?
new LearnedSelfAttentionLayer.Builder().nOut(8).nHeads(2).nQueries(numQueries).projectInput(true).build()
new LearnedSelfAttentionLayer.Builder().nOut(4).nHeads(2).nQueries(numQueries).projectInput(true).build()
: new LearnedSelfAttentionLayer.Builder().nHeads(1).nQueries(numQueries).projectInput(false).build()
)
.layer(new GlobalPoolingLayer.Builder().poolingType(PoolingType.MAX).build())
@ -210,17 +204,14 @@ public class AttentionLayerTest extends BaseDL4JTest {
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
for (int mb : new int[]{3, 2, 1}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
for (int mb : new int[]{3, 1}) {
INDArray in = Nd4j.rand(DataType.DOUBLE, new int[]{mb, nIn, tsLength});
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
if (inputMask) {
inMask = Nd4j.ones(mb, tsLength);
inMask = Nd4j.ones(DataType.INT, mb, tsLength);
for (int i = 0; i < mb; i++) {
int firstMaskedStep = tsLength - 1 - i;
if (firstMaskedStep == 0) {
@ -236,7 +227,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
System.out.println("Starting test: " + name);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, true, 100);
assertTrue(name, gradOK);
}
}
@ -282,20 +273,15 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testRecurrentAttentionLayer() {
int nIn = 9;
int nOut = 5;
int tsLength = 4;
int layerSize = 8;
int nIn = 4;
int nOut = 2;
int tsLength = 3;
int layerSize = 3;
Random r = new Random(12345);
for (int mb : new int[]{3, 2, 1}) {
for (int mb : new int[]{3, 1}) {
for (boolean inputMask : new boolean[]{true, false}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
INDArray in = Nd4j.rand(DataType.DOUBLE, new int[]{mb, nIn, tsLength});
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
@ -335,8 +321,7 @@ public class AttentionLayerTest extends BaseDL4JTest {
//System.out.println("Original");
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, false, -1, null
);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, true, 100, null);
assertTrue(name, gradOK);
}
}
@ -345,19 +330,16 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testAttentionVertex() {
int nIn = 3;
int nOut = 5;
int tsLength = 4;
int layerSize = 8;
int nOut = 2;
int tsLength = 3;
int layerSize = 3;
Random r = new Random(12345);
for (boolean inputMask : new boolean[]{false, true}) {
for (int mb : new int[]{3, 2, 1}) {
for (int mb : new int[]{3, 1}) {
for (boolean projectInput : new boolean[]{false, true}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
INDArray in = Nd4j.rand(DataType.DOUBLE, new int[]{mb, nIn, tsLength});
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
@ -385,13 +367,13 @@ public class AttentionLayerTest extends BaseDL4JTest {
.weightInit(WeightInit.XAVIER)
.graphBuilder()
.addInputs("input")
.addLayer("lstmKeys", new LSTM.Builder().nOut(layerSize).build(), "input")
.addLayer("lstmQueries", new LSTM.Builder().nOut(layerSize).build(), "input")
.addLayer("lstmValues", new LSTM.Builder().nOut(layerSize).build(), "input")
.addLayer("rnnKeys", new SimpleRnn.Builder().nOut(layerSize).build(), "input")
.addLayer("rnnQueries", new SimpleRnn.Builder().nOut(layerSize).build(), "input")
.addLayer("rnnValues", new SimpleRnn.Builder().nOut(layerSize).build(), "input")
.addVertex("attention",
projectInput ?
new AttentionVertex.Builder().nOut(8).nHeads(2).projectInput(true).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build()
: new AttentionVertex.Builder().nOut(8).nHeads(1).projectInput(false).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build(), "lstmQueries", "lstmKeys", "lstmValues")
new AttentionVertex.Builder().nOut(4).nHeads(2).projectInput(true).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build()
: new AttentionVertex.Builder().nOut(3).nHeads(1).projectInput(false).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build(), "rnnQueries", "rnnKeys", "rnnValues")
.addLayer("pooling", new GlobalPoolingLayer.Builder().poolingType(PoolingType.MAX).build(), "attention")
.addLayer("output", new OutputLayer.Builder().nOut(nOut).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build(), "pooling")
.setOutputs("output")
@ -412,19 +394,16 @@ public class AttentionLayerTest extends BaseDL4JTest {
@Test
public void testAttentionVertexSameInput() {
int nIn = 3;
int nOut = 5;
int nOut = 2;
int tsLength = 4;
int layerSize = 8;
int layerSize = 4;
Random r = new Random(12345);
for (boolean inputMask : new boolean[]{false, true}) {
for (int mb : new int[]{3, 2, 1}) {
for (int mb : new int[]{3, 1}) {
for (boolean projectInput : new boolean[]{false, true}) {
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut);
for (int i = 0; i < mb; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
INDArray labels = TestUtils.randomOneHot(mb, nOut);
String maskType = (inputMask ? "inputMask" : "none");
INDArray inMask = null;
@ -452,11 +431,11 @@ public class AttentionLayerTest extends BaseDL4JTest {
.weightInit(WeightInit.XAVIER)
.graphBuilder()
.addInputs("input")
.addLayer("lstm", new LSTM.Builder().nOut(layerSize).build(), "input")
.addLayer("rnn", new SimpleRnn.Builder().activation(Activation.TANH).nOut(layerSize).build(), "input")
.addVertex("attention",
projectInput ?
new AttentionVertex.Builder().nOut(8).nHeads(2).projectInput(true).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build()
: new AttentionVertex.Builder().nOut(8).nHeads(1).projectInput(false).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build(), "lstm", "lstm", "lstm")
new AttentionVertex.Builder().nOut(4).nHeads(2).projectInput(true).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build()
: new AttentionVertex.Builder().nOut(4).nHeads(1).projectInput(false).nInQueries(layerSize).nInKeys(layerSize).nInValues(layerSize).build(), "rnn", "rnn", "rnn")
.addLayer("pooling", new GlobalPoolingLayer.Builder().poolingType(PoolingType.MAX).build(), "attention")
.addLayer("output", new OutputLayer.Builder().nOut(nOut).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build(), "pooling")
.setOutputs("output")
@ -467,7 +446,8 @@ public class AttentionLayerTest extends BaseDL4JTest {
net.init();
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, new INDArray[]{in}, new INDArray[]{labels}, inMask != null ? new INDArray[]{inMask} : null, null);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, new INDArray[]{in},
new INDArray[]{labels}, inMask != null ? new INDArray[]{inMask} : null, null);
assertTrue(name, gradOK);
}
}
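The AttentionLayerTest changes above extend each GradientCheckUtil.checkGradients(...) call with two trailing arguments (e.g. true, 100; other files in this commit use 25, 50, 128, 160, 256 or 512). Judging by the comments added later in this commit ("Most params are in output layer, only these should be skipped with this threshold"), these arguments appear to restrict the numerical check to a bounded subset of parameters, which is the expensive part of a gradient check. The stand-alone sketch below illustrates that idea with a plain finite-difference check; it is not the actual GradientCheckUtil implementation, and every name in it is illustrative.

import java.util.Random;
import java.util.function.ToDoubleFunction;

public class SubsetGradCheckSketch {

    // Check at most maxChecks randomly chosen parameters with central finite differences.
    static boolean checkSubset(double[] params, double[] analyticGrad,
                               ToDoubleFunction<double[]> loss,
                               int maxChecks, double eps, double maxRelError, long seed) {
        Random r = new Random(seed);
        int checks = Math.min(maxChecks, params.length);
        for (int c = 0; c < checks; c++) {
            int i = r.nextInt(params.length);                  // random parameter index
            double orig = params[i];
            params[i] = orig + eps;  double plus  = loss.applyAsDouble(params);
            params[i] = orig - eps;  double minus = loss.applyAsDouble(params);
            params[i] = orig;                                  // restore the parameter
            double numeric = (plus - minus) / (2 * eps);
            double denom = Math.max(Math.abs(numeric), Math.abs(analyticGrad[i]));
            double relError = denom == 0.0 ? 0.0 : Math.abs(numeric - analyticGrad[i]) / denom;
            if (relError > maxRelError) return false;          // fail like assertTrue(gradOK) would
        }
        return true;
    }

    public static void main(String[] args) {
        double[] p = {1.0, -2.0, 0.5};
        // loss = 0.5 * ||p||^2, so the analytic gradient is just p itself
        ToDoubleFunction<double[]> loss = x -> 0.5 * (x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
        System.out.println(checkSubset(p, p.clone(), loss, 2, 1e-6, 1e-3, 12345));
    }
}

Checking a bounded random subset trades exhaustiveness for runtime, which matches the stated goal of this commit.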

File: BNGradientCheckTest.java

@ -76,7 +76,7 @@ public class BNGradientCheckTest extends BaseDL4JTest {
INDArray input = ds.getFeatures();
INDArray labels = ds.getLabels();
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
MultiLayerConfiguration.Builder builder =
new NeuralNetConfiguration.Builder().updater(new NoOp())
@ -117,14 +117,14 @@ public class BNGradientCheckTest extends BaseDL4JTest {
int depth = 1;
int hw = 4;
int nOut = 4;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray input = Nd4j.rand(new int[]{minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.updater(new NoOp()).seed(12345L)
@ -158,20 +158,14 @@ public class BNGradientCheckTest extends BaseDL4JTest {
}
@Test
public void testGradientBNWithCNNandSubsamplingcCnfigurableProfiler() {
Nd4j.getExecutioner().setProfilingConfig(ProfilerConfig.builder()
.notOptimalArguments(true)
.notOptimalTAD(true)
.checkForINF(true)
.checkForNAN(true)
.checkElapsedTime(true)
.stackTrace(true)
.checkWorkspaces(true)
.build());
public void testGradientBNWithCNNandSubsampling() {
//Parameterized test, testing combinations of:
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
boolean[] characteristic = {true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
@ -181,24 +175,24 @@ public class BNGradientCheckTest extends BaseDL4JTest {
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int minibatch = 4;
int depth = 2;
int hw = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw}).muli(5).subi(2.5);
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
int nOut = 2;
INDArray input = Nd4j.rand(new int[]{minibatch, depth, hw, hw}).muli(5).subi(2.5);
INDArray labels = TestUtils.randomOneHot(minibatch, nOut);
DataSet ds = new DataSet(input, labels);
for(boolean useLogStd : new boolean[]{true, false}) {
Random rng = new Random(12345);
for (boolean useLogStd : new boolean[]{true, false}) {
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
//Skip 2 of every 3 tests: from 24 cases to 8, still with decent coverage
if (rng.nextInt(3) != 0)
continue;
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
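The hunk above adds a fixed-seed Random and an "if (rng.nextInt(3) != 0) continue;" guard inside the nested parameter loops, so only about one in three combinations is actually gradient-checked while the selection stays deterministic across runs (the comment cites 24 cases reduced to 8). A minimal, self-contained illustration of the pattern; the loop bounds and variable names here are made up, not the test's real ones:

import java.util.Random;

public class SkipSketch {
    public static void main(String[] args) {
        Random rng = new Random(12345);                    // fixed seed => same subset every run
        int run = 0, total = 0;
        for (int a = 0; a < 2; a++) {                      // e.g. useLogStd
            for (int b = 0; b < 3; b++) {                  // e.g. activation function
                for (int c = 0; c < 4; c++) {              // e.g. l1/l2 combination
                    total++;
                    if (rng.nextInt(3) != 0) continue;     // keep roughly 1 of every 3 combinations
                    run++;
                    // ... build the network for (a, b, c) and run the gradient check here ...
                }
            }
        }
        System.out.println("ran " + run + " of " + total + " combinations");
    }
}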
@ -260,7 +254,7 @@ public class BNGradientCheckTest extends BaseDL4JTest {
//However, numerical gradient will be 0 as forward pass doesn't depend on this "parameter"
Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "3_mean", "3_var", "1_log10stdev", "3_log10stdev"));
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, excludeParams);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 25, excludeParams); //Most params are in output layer, only these should be skipped with this threshold
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
@ -269,117 +263,6 @@ public class BNGradientCheckTest extends BaseDL4JTest {
}
}
}
OpProfiler.getInstance().printOutDashboard();
}
@Test
public void testGradientBNWithCNNandSubsampling() {
Nd4j.getExecutioner().setProfilingMode(OpExecutioner.ProfilingMode.NAN_PANIC);
//Parameterized test, testing combinations of:
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 2;
int hw = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw}).muli(5).subi(2.5);
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
DataSet ds = new DataSet(input, labels);
for(boolean useLogStd : new boolean[]{true, false}) {
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.l2(l2vals[j])
.optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
.updater(new NoOp())
.dist(new UniformDistribution(-2, 2)).seed(12345L).list()
.layer(0, new ConvolutionLayer.Builder(2, 2).stride(1, 1).nOut(3)
.activation(afn).build())
.layer(1, new BatchNormalization.Builder().useLogStd(useLogStd).build())
.layer(2, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
.kernelSize(2, 2).stride(1, 1).build())
.layer(3, new BatchNormalization())
.layer(4, new ActivationLayer.Builder().activation(afn).build())
.layer(5, new OutputLayer.Builder(lf).activation(outputActivation).nOut(nOut)
.build())
.setInputType(InputType.convolutional(hw, hw, depth));
MultiLayerConfiguration conf = builder.build();
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String name = new Object() {
}.getClass().getEnclosingMethod().getName();
System.out.println("Num params: " + mln.numParams());
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int k = 0; k < 20; k++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst= " + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.9 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < mln.getnLayers(); k++)
System.out.println("Layer " + k + " # params: " + mln.getLayer(k).numParams());
}
//Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc
//i.e., runningMean = decay * runningMean + (1-decay) * batchMean
//However, numerical gradient will be 0 as forward pass doesn't depend on this "parameter"
Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "3_mean", "3_var", "1_log10stdev", "3_log10stdev"));
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, excludeParams);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}
}
}
}
}
OpProfiler.getInstance().printOutDashboard();
}
@ -390,21 +273,21 @@ public class BNGradientCheckTest extends BaseDL4JTest {
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
Activation[] activFns = {Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
double[] l2vals = {0.0, 0.1};
double[] l1vals = {0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int nIn = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, nIn});
INDArray input = Nd4j.rand(new int[]{minibatch, nIn});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
@ -413,7 +296,7 @@ public class BNGradientCheckTest extends BaseDL4JTest {
DataSet ds = new DataSet(input, labels);
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
@ -498,7 +381,7 @@ public class BNGradientCheckTest extends BaseDL4JTest {
INDArray input = ds.getFeatures();
INDArray labels = ds.getLabels();
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.seed(12345L)
@ -537,14 +420,14 @@ public class BNGradientCheckTest extends BaseDL4JTest {
int depth = 1;
int hw = 4;
int nOut = 4;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray input = Nd4j.rand(new int[]{minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.seed(12345L)
@ -588,7 +471,7 @@ public class BNGradientCheckTest extends BaseDL4JTest {
int minibatchSize = 3;
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(seed).updater(new NoOp())
.dataType(DataType.DOUBLE)
@ -630,22 +513,21 @@ public class BNGradientCheckTest extends BaseDL4JTest {
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
Activation[] activFns = {Activation.TANH, Activation.IDENTITY};
boolean doLearningFirst = true;
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
LossFunctions.LossFunction[] lossFunctions = {LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD};
Activation[] outputActivations = {Activation.SOFTMAX}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
double[] l2vals = {0.0, 0.1};
double[] l1vals = {0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 2;
int hw = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray input = Nd4j.rand(new int[]{minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
@ -654,75 +536,73 @@ public class BNGradientCheckTest extends BaseDL4JTest {
DataSet ds = new DataSet(input, labels);
for(boolean useLogStd : new boolean[]{true, false}) {
for (boolean useLogStd : new boolean[]{true, false}) {
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
.updater(new NoOp())
.dist(new UniformDistribution(-2, 2)).seed(12345L).graphBuilder()
.addInputs("in")
.addLayer("0", new ConvolutionLayer.Builder(2, 2).stride(1, 1).nOut(3)
.activation(afn).build(), "in")
.addLayer("1", new BatchNormalization.Builder().useLogStd(useLogStd).build(), "0")
.addLayer("2", new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
.kernelSize(2, 2).stride(1, 1).build(), "1")
.addLayer("3", new BatchNormalization.Builder().useLogStd(useLogStd).build(), "2")
.addLayer("4", new ActivationLayer.Builder().activation(afn).build(), "3")
.addLayer("5", new OutputLayer.Builder(lf).activation(outputActivation)
.nOut(nOut).build(), "4")
.setOutputs("5").setInputTypes(InputType.convolutional(hw, hw, depth))
.build();
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
.updater(new NoOp())
.dist(new UniformDistribution(-2, 2)).seed(12345L).graphBuilder()
.addInputs("in")
.addLayer("0", new ConvolutionLayer.Builder(2, 2).stride(1, 1).nOut(3)
.activation(afn).build(), "in")
.addLayer("1", new BatchNormalization.Builder().useLogStd(useLogStd).build(), "0")
.addLayer("2", new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
.kernelSize(2, 2).stride(1, 1).build(), "1")
.addLayer("3", new BatchNormalization.Builder().useLogStd(useLogStd).build(), "2")
.addLayer("4", new ActivationLayer.Builder().activation(afn).build(), "3")
.addLayer("5", new OutputLayer.Builder(lf).activation(outputActivation)
.nOut(nOut).build(), "4")
.setOutputs("5").setInputTypes(InputType.convolutional(hw, hw, depth))
.build();
ComputationGraph net = new ComputationGraph(conf);
net.init();
String name = new Object() {
}.getClass().getEnclosingMethod().getName();
ComputationGraph net = new ComputationGraph(conf);
net.init();
String name = new Object() {
}.getClass().getEnclosingMethod().getName();
if (doLearningFirst) {
//Run a number of iterations of learning
net.setInput(0, ds.getFeatures());
net.setLabels(ds.getLabels());
net.computeGradientAndScore();
double scoreBefore = net.score();
for (int k = 0; k < 20; k++)
net.fit(ds);
net.computeGradientAndScore();
double scoreAfter = net.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst= " + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.9 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < net.getNumLayers(); k++)
System.out.println("Layer " + k + " # params: " + net.getLayer(k).numParams());
}
//Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc
//i.e., runningMean = decay * runningMean + (1-decay) * batchMean
//However, numerical gradient will be 0 as forward pass doesn't depend on this "parameter"
Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "3_mean", "3_var", "1_log10stdev", "3_log10stdev"));
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE,
new INDArray[]{input}, new INDArray[]{labels}, null, null, excludeParams);
assertTrue(gradOK);
TestUtils.testModelSerialization(net);
if (doLearningFirst) {
//Run a number of iterations of learning
net.setInput(0, ds.getFeatures());
net.setLabels(ds.getLabels());
net.computeGradientAndScore();
double scoreBefore = net.score();
for (int k = 0; k < 20; k++)
net.fit(ds);
net.computeGradientAndScore();
double scoreAfter = net.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst= " + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.9 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < net.getNumLayers(); k++)
System.out.println("Layer " + k + " # params: " + net.getLayer(k).numParams());
}
//Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc
//i.e., runningMean = decay * runningMean + (1-decay) * batchMean
//However, numerical gradient will be 0 as forward pass doesn't depend on this "parameter"
Set<String> excludeParams = new HashSet<>(Arrays.asList("1_mean", "1_var", "3_mean", "3_var", "1_log10stdev", "3_log10stdev"));
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE,
new INDArray[]{input}, new INDArray[]{labels}, null, null, excludeParams);
assertTrue(gradOK);
TestUtils.testModelSerialization(net);
}
}
}
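The excludeParams sets above ("1_mean", "1_var", "3_mean", "3_var", ...) skip batch-normalization state because, as the in-code comments note, the stored mean and variance are running statistics updated as runningMean = decay * runningMean + (1 - decay) * batchMean rather than trainable parameters, so their finite-difference gradient is zero. A toy illustration of that exponential moving average; the decay value and batch statistics below are arbitrary examples:

public class RunningStatsSketch {
    public static void main(String[] args) {
        double decay = 0.9;                                          // arbitrary example value
        double runningMean = 0.0, runningVar = 1.0;
        double[][] batches = {{1.0, 3.0}, {2.0, 2.0}, {0.0, 4.0}};   // toy (batchMean, batchVar) pairs
        for (double[] b : batches) {
            runningMean = decay * runningMean + (1 - decay) * b[0];  // the update the comments cite
            runningVar  = decay * runningVar  + (1 - decay) * b[1];
        }
        // The training-time forward pass does not read these running values, so a
        // finite-difference "gradient" for them is 0 and they are excluded from the check.
        System.out.printf("runningMean=%.4f runningVar=%.4f%n", runningMean, runningVar);
    }
}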

File: CNN3DGradientCheckTest.java

@ -151,7 +151,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest {
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS,
DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS,
RETURN_ON_FIRST_FAILURE, input, labels);
RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 128);
assertTrue(msg, gradOK);
@ -255,7 +255,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest {
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS,
DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS,
RETURN_ON_FIRST_FAILURE, input, labels);
RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 512);
assertTrue(msg, gradOK);

File: CNNGradientCheckTest.java

@ -142,12 +142,6 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
Activation[] activFns = {Activation.SIGMOID, Activation.TANH};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
DataSet ds = new IrisDataSetIterator(150, 150).next();
ds.normalizeZeroMeanZeroUnitVariance();
@ -159,72 +153,74 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
double[] l1vals = {0.0, 0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.0, 0.6, 0.0};
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.ELU, Activation.SOFTPLUS};
boolean[] characteristic = {false, true, false, true}; //If true: run some backprop steps first
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int k = 0; k < l2vals.length; k++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[k];
double l1 = l1vals[k];
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE, LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH, Activation.SOFTMAX, Activation.IDENTITY}; //i.e., lossFunctions[i] used with outputActivations[i] here
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.l2(l2).l1(l1).l2Bias(biasL2[k]).l1Bias(biasL1[k])
.optimizationAlgo(
OptimizationAlgorithm.CONJUGATE_GRADIENT)
.seed(12345L).list()
.layer(0, new ConvolutionLayer.Builder(new int[]{1, 1}).nIn(1).nOut(6)
.weightInit(WeightInit.XAVIER).activation(afn)
.updater(new NoOp()).build())
.layer(1, new OutputLayer.Builder(lf).activation(outputActivation).nOut(3)
.weightInit(WeightInit.XAVIER).updater(new NoOp()).build())
for( int i=0; i<l2vals.length; i++ ){
Activation afn = activFns[i];
boolean doLearningFirst = characteristic[i];
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[i];
double l1 = l1vals[i];
.setInputType(InputType.convolutionalFlat(1, 4, 1));
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.l2(l2).l1(l1).l2Bias(biasL2[i]).l1Bias(biasL1[i])
.optimizationAlgo(
OptimizationAlgorithm.CONJUGATE_GRADIENT)
.seed(12345L).list()
.layer(0, new ConvolutionLayer.Builder(new int[]{1, 1}).nIn(1).nOut(6)
.weightInit(WeightInit.XAVIER).activation(afn)
.updater(new NoOp()).build())
.layer(1, new OutputLayer.Builder(lf).activation(outputActivation).nOut(3)
.weightInit(WeightInit.XAVIER).updater(new NoOp()).build())
MultiLayerConfiguration conf = builder.build();
.setInputType(InputType.convolutionalFlat(1, 4, 1));
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String testName = new Object() {
}.getClass().getEnclosingMethod().getName();
MultiLayerConfiguration conf = builder.build();
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int j = 0; j < 10; j++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = testName
+ "- score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
}
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String testName = new Object() {
}.getClass().getEnclosingMethod().getName();
if (PRINT_RESULTS) {
System.out.println(testName + "- activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}
}
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int j = 0; j < 10; j++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = testName
+ "- score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(testName + "- activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}
}
@ -369,56 +365,43 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int[] padding = {0, 0};
int size = 2;
String[] activations = {"sigmoid", "tanh"};
SubsamplingLayer.PoolingType[] poolingTypes =
new SubsamplingLayer.PoolingType[]{SubsamplingLayer.PoolingType.MAX,
SubsamplingLayer.PoolingType.AVG, SubsamplingLayer.PoolingType.PNORM};
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
for (String afn : activations) {
for (SubsamplingLayer.PoolingType poolingType : poolingTypes) {
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1))
.list().layer(new ConvolutionLayer.Builder(kernel,
stride, padding).nIn(inputDepth)
.nOut(3).build())//output: (5-2+0)/1+1 = 4
.layer(new Upsampling2D.Builder().size(size).build()) //output: 4*2 =8 -> 8x8x3
.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(8 * 8 * 3)
.nOut(4).build())
.setInputType(InputType.convolutionalFlat(height, width,
inputDepth))
.build();
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1))
.list().layer(new ConvolutionLayer.Builder(kernel,
stride, padding).nIn(inputDepth)
.nOut(3).build())//output: (5-2+0)/1+1 = 4
.layer(new Upsampling2D.Builder().size(size).build()) //output: 4*2 =8 -> 8x8x3
.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(8 * 8 * 3)
.nOut(4).build())
.setInputType(InputType.convolutionalFlat(height, width,
inputDepth))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
String msg = "Upsampling - minibatch=" + minibatchSize;
String msg = "PoolingType=" + poolingType + ", minibatch=" + minibatchSize + ", activationFn="
+ afn;
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
@ -695,60 +678,56 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testCnnSamePaddingMode() {
int nOut = 2;
int[] minibatchSizes = {1, 3};
int[] minibatchSizes = {1, 3, 3, 2, 1, 2};
int[] heights = new int[]{4, 5, 6, 5, 4, 4}; //Same padding mode: insensitive to exact input size...
int[] kernelSizes = new int[]{2, 3, 2, 3, 2, 3};
int[] inputDepths = {1, 2, 4, 3, 2, 3};
int width = 5;
int[] heights = new int[]{4, 5, 6}; //Same padding mode: insensitive to exact input size...
int[] kernelSizes = new int[]{2, 3};
int[] inputDepths = {1, 2, 4};
Nd4j.getRandom().setSeed(12345);
for (int inputDepth : inputDepths) {
for (int minibatchSize : minibatchSizes) {
for (int height : heights) {
for (int k : kernelSizes) {
for( int i=0; i<minibatchSizes.length; i++ ){
int inputDepth = inputDepths[i];
int minibatchSize = minibatchSizes[i];
int height = heights[i];
int k = kernelSizes[i];
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(Same).list()
.layer(0, new ConvolutionLayer.Builder().name("layer 0").kernelSize(k, k)
.stride(1, 1).padding(0, 0).nIn(inputDepth).nOut(2).build())
.layer(1, new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(2, new ConvolutionLayer.Builder().nIn(2).nOut(2).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(height, width, inputDepth)).build();
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(Same).list()
.layer(0, new ConvolutionLayer.Builder().name("layer 0").kernelSize(k, k)
.stride(1, 1).padding(0, 0).nIn(inputDepth).nOut(2).build())
.layer(1, new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(2, new ConvolutionLayer.Builder().nIn(2).nOut(2).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(height, width, inputDepth)).build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
for (int i = 0; i < net.getLayers().length; i++) {
System.out.println("nParams, layer " + i + ": " + net.getLayer(i).numParams());
}
String msg = "Minibatch=" + minibatchSize + ", inDepth=" + inputDepth + ", height=" + height
+ ", kernelSize=" + k;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
for (int j = 0; j < net.getLayers().length; j++) {
System.out.println("nParams, layer " + j + ": " + net.getLayer(j).numParams());
}
String msg = "Minibatch=" + minibatchSize + ", inDepth=" + inputDepth + ", height=" + height
+ ", kernelSize=" + k;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
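The testCnnSamePaddingMode hunk above notes that Same padding mode is "insensitive to exact input size", which is why heights 4, 5 and 6 can be mixed freely with different kernel sizes. Assuming DL4J's Same and Truncate modes follow the usual SAME/valid conventions, the spatial output size is ceil(in / stride) for Same and floor((in - k) / stride) + 1 for Truncate; a quick check of both formulas:

public class PaddingSizeSketch {
    static int same(int in, int stride) { return (in + stride - 1) / stride; }          // ceil(in / stride)
    static int truncate(int in, int k, int stride) { return (in - k) / stride + 1; }    // floor((in - k) / stride) + 1

    public static void main(String[] args) {
        int stride = 1, k = 3;
        for (int in : new int[]{4, 5, 6}) {
            System.out.println("in=" + in + "  same=" + same(in, stride)
                    + "  truncate=" + truncate(in, k, stride));
        }
    }
}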
@ -809,7 +788,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input,
labels);
labels, null, null, true, 128);
assertTrue(msg, gradOK);
@ -827,68 +806,66 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
Nd4j.getRandom().setSeed(12345);
int nOut = 4;
int[] minibatchSizes = {1, 3};
int width = 6;
int height = 6;
int[] inputDepths = {1, 3};
int[] kernel = {2, 2};
int[] stride = {1, 1};
int[] padding = {0, 0};
int[] minibatchSizes = {1, 3, 2};
int[] inputDepths = {1, 3, 2};
int[][] zeroPadLayer = new int[][]{{0, 0, 0, 0}, {1, 1, 0, 0}, {2, 2, 2, 2}};
for (int inputDepth : inputDepths) {
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(new int[]{minibatchSize, inputDepth, height, width});
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
for (int[] zeroPad : zeroPadLayer) {
for( int i=0; i<minibatchSizes.length; i++ ){
int minibatchSize = minibatchSizes[i];
int inputDepth = inputDepths[i];
int[] zeroPad = zeroPadLayer[i];
INDArray input = Nd4j.rand(DataType.DOUBLE, new int[]{minibatchSize, inputDepth, height, width});
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new ConvolutionLayer.Builder(kernel, stride, padding)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(1, new ZeroPaddingLayer.Builder(zeroPad).build()).layer(2,
new ConvolutionLayer.Builder(kernel, stride,
padding).nIn(3).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new ConvolutionLayer.Builder(kernel, stride, padding)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(1, new ZeroPaddingLayer.Builder(zeroPad).build()).layer(2,
new ConvolutionLayer.Builder(kernel, stride,
padding).nIn(3).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
//Check zero padding activation shape
org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer zpl =
(org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer) net.getLayer(1);
val expShape = new long[]{minibatchSize, inputDepth, height + zeroPad[0] + zeroPad[1],
width + zeroPad[2] + zeroPad[3]};
INDArray out = zpl.activate(input, false, LayerWorkspaceMgr.noWorkspaces());
assertArrayEquals(expShape, out.shape());
//Check zero padding activation shape
org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer zpl =
(org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer) net.getLayer(1);
val expShape = new long[]{minibatchSize, inputDepth, height + zeroPad[0] + zeroPad[1],
width + zeroPad[2] + zeroPad[3]};
INDArray out = zpl.activate(input, false, LayerWorkspaceMgr.noWorkspaces());
assertArrayEquals(expShape, out.shape());
String msg = "minibatch=" + minibatchSize + ", channels=" + inputDepth + ", zeroPad = "
+ Arrays.toString(zeroPad);
String msg = "minibatch=" + minibatchSize + ", channels=" + inputDepth + ", zeroPad = "
+ Arrays.toString(zeroPad);
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
@ -896,12 +873,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testDeconvolution2D() {
int nOut = 2;
int[] minibatchSizes = new int[]{1, 3, 1, 3, 1, 3, 1, 3};
int[] kernelSizes = new int[]{1, 1, 3, 3, 1, 1, 3, 3};
int[] strides = {1, 1, 1, 1, 2, 2, 2, 2};
int[] dilation = {1, 2, 2, 1, 1, 1, 2, 2};
Activation[] activations = new Activation[]{Activation.SIGMOID, Activation.TANH, Activation.TANH, Activation.TANH, Activation.TANH, Activation.SIGMOID, Activation.SIGMOID, Activation.SIGMOID};
ConvolutionMode[] cModes = new ConvolutionMode[]{Same, Same, Same, Same, Truncate, Truncate, Truncate, Truncate};
int[] minibatchSizes = new int[]{1, 3, 3, 1, 3};
int[] kernelSizes = new int[]{1, 1, 1, 3, 3};
int[] strides = {1, 1, 2, 2, 2};
int[] dilation = {1, 2, 1, 2, 2};
Activation[] activations = new Activation[]{Activation.SIGMOID, Activation.TANH, Activation.SIGMOID, Activation.SIGMOID, Activation.SIGMOID};
ConvolutionMode[] cModes = new ConvolutionMode[]{Same, Same, Truncate, Truncate, Truncate};
int width = 7;
int height = 7;
int inputDepth = 3;
@ -954,7 +931,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 100);
assertTrue(msg, gradOK);
@ -967,21 +944,17 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int nOut = 2;
int[] minibatchSizes = new int[]{1, 3};
int width = 8;
int height = 8;
int width = 6;
int height = 6;
int inputDepth = 3;
int[] kernelSizes = new int[]{2, 3};
int[] strides = {1, 2};
int[] dilation = {1, 2};
ConvolutionMode[] cModes = new ConvolutionMode[]{ConvolutionMode.Truncate};
Nd4j.getRandom().setSeed(12345);
int[] ks = new int[]{1, 3, 1, 3, 1, 3, 1, 3};
int[] ss = new int[]{1, 1, 2, 2, 1, 1, 2, 2};
int[] ds = new int[]{1, 1, 1, 1, 2, 2, 2, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1, 1, 3, 3, 3, 1, 3, 3};
int[] ks = new int[]{1, 3, 3, 1, 3};
int[] ss = new int[]{1, 1, 1, 2, 2};
int[] ds = new int[]{1, 1, 2, 2, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1, 1, 1, 3, 3};
for (int t = 0; t < ks.length; t++) {
@ -1030,7 +1003,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 50); //Most params are in output layer
assertTrue(msg, gradOK);
@ -1042,18 +1015,18 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testCnnDilated() {
int nOut = 2;
int minibatchSize = 3;
int minibatchSize = 2;
int width = 8;
int height = 8;
int inputDepth = 3;
int inputDepth = 2;
Nd4j.getRandom().setSeed(12345);
boolean[] sub = new boolean[]{true, false, true, false, true, false, true, false};
int[] stride = new int[]{1, 1, 2, 2, 1, 1, 2, 2};
int[] kernel = new int[]{2, 2, 2, 2, 3, 3, 3, 3};
int[] ds = new int[]{2, 3, 3, 2, 2, 3, 3, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Same, Same, Same, Truncate, Truncate, Truncate, Same, Truncate};
boolean[] sub = new boolean[]{true, true, false, true, false};
int[] stride = new int[]{1, 1, 1, 2, 2};
int[] kernel = new int[]{2, 3, 3, 3, 3};
int[] ds = new int[]{2, 2, 3, 3, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Same, Truncate, Truncate, Same, Truncate};
for (int t = 0; t < sub.length; t++) {
@ -1126,7 +1099,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
@Test
public void testCropping2DLayer() {
Nd4j.getRandom().setSeed(12345);
int nOut = 4;
int nOut = 2;
int[] minibatchSizes = {1, 3};
int width = 12;
@ -1155,11 +1128,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.convolutionMode(ConvolutionMode.Same)
.weightInit(new NormalDistribution(0, 1)).list()
.layer(new ConvolutionLayer.Builder(kernel, stride, padding)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.nIn(inputDepth).nOut(2).build())//output: (6-2+0)/1+1 = 5
.layer(new Cropping2D(crop))
.layer(new ConvolutionLayer.Builder(kernel, stride, padding).nIn(3).nOut(3).build())
.layer(new ConvolutionLayer.Builder(kernel, stride, padding).nIn(2).nOut(2).build())
.layer(new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.AVG).kernelSize(3, 3).stride(3, 3).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
@ -1184,7 +1158,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 160);
assertTrue(msg, gradOK);
@ -1200,16 +1174,16 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int depthMultiplier = 2;
int nOut = nIn * depthMultiplier;
int width = 8;
int height = 8;
int width = 5;
int height = 5;
Nd4j.getRandom().setSeed(12345);
int[] ks = new int[]{1,3,1,3,1,3,1,3};
int[] ss = new int[]{1,1,2,2,1,1,2,2};
int[] ks = new int[]{1,3,3,1,3};
int[] ss = new int[]{1,1,1,2,2};
ConvolutionMode[] cms = new ConvolutionMode[]{
Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1,1,3,3,3,1,3,3};
Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1,1,1,3,3};
for( int t=0; t<ks.length; t++ ){
@ -1255,7 +1229,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 256);
assertTrue(msg, gradOK);

View File

@ -39,6 +39,8 @@ import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.learning.config.NoOp;
import org.nd4j.linalg.lossfunctions.impl.LossNegativeLogLikelihood;
import java.util.Random;
public class CapsnetGradientCheckTest extends BaseDL4JTest {
private static final boolean PRINT_RESULTS = true;
@ -70,6 +72,7 @@ public class CapsnetGradientCheckTest extends BaseDL4JTest {
for (int capsule : capsules) {
for (int capsuleDim : capsuleDims) {
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(minibatchSize, inputDepth * height * width).mul(10)
.reshape(-1, inputDepth, height, width);
INDArray labels = Nd4j.zeros(minibatchSize, capsule);
@ -110,7 +113,7 @@ public class CapsnetGradientCheckTest extends BaseDL4JTest {
boolean gradOK = GradientCheckUtil
.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input,
labels);
labels, null, null, true, 100);
assertTrue(msg, gradOK);

View File

@ -100,15 +100,15 @@ public class DropoutGradientCheck extends BaseDL4JTest {
.list();
if(cnn){
builder.layer(new ConvolutionLayer.Builder().kernelSize(3,3).stride(1,1).nOut(3).build());
builder.layer(new ConvolutionLayer.Builder().kernelSize(3,3).stride(1,1).nOut(3).build());
builder.setInputType(InputType.convolutional(8,8,3));
builder.layer(new ConvolutionLayer.Builder().kernelSize(3,3).stride(2,2).nOut(2).build());
builder.layer(new ConvolutionLayer.Builder().kernelSize(3,3).stride(2,2).nOut(2).build());
builder.setInputType(InputType.convolutional(6,6,2));
} else {
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.setInputType(InputType.feedForward(8));
builder.layer(new DenseLayer.Builder().nOut(3).build());
builder.layer(new DenseLayer.Builder().nOut(3).build());
builder.setInputType(InputType.feedForward(6));
}
builder.layer(new OutputLayer.Builder().nOut(10).activation(Activation.SOFTMAX).lossFunction(LossFunction.MCXENT).build());
builder.layer(new OutputLayer.Builder().nOut(3).activation(Activation.SOFTMAX).lossFunction(LossFunction.MCXENT).build());
MultiLayerConfiguration conf = builder.build();
//Remove spatial dropout from output layer - can't be used for 2d input
@ -123,11 +123,11 @@ public class DropoutGradientCheck extends BaseDL4JTest {
INDArray f;
if(cnn){
f = Nd4j.rand(new int[]{minibatch, 3, 8, 8}).muli(10).subi(5);
f = Nd4j.rand(new int[]{minibatch, 2, 6, 6}).muli(10).subi(5);
} else {
f = Nd4j.rand(minibatch, 8).muli(10).subi(5);
f = Nd4j.rand(minibatch, 6).muli(10).subi(5);
}
INDArray l = TestUtils.randomOneHot(minibatch, 10);
INDArray l = TestUtils.randomOneHot(minibatch, 3);
log.info("*** Starting test: " + msg + " ***");
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,

View File

@ -24,6 +24,7 @@ import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
@ -53,11 +54,11 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
private static final double DEFAULT_MIN_ABS_ERROR = 1e-8;
@Test
public void testLSTMGlobalPoolingBasicMultiLayer() {
public void testRNNGlobalPoolingBasicMultiLayer() {
//Basic test of global pooling w/ LSTM
Nd4j.getRandom().setSeed(12345L);
int timeSeriesLength = 10;
int timeSeriesLength = 5;
int nIn = 5;
int layerSize = 4;
int nOut = 2;
@ -73,7 +74,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1.0)).seed(12345L).list()
.layer(0, new GravesLSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.layer(0, new SimpleRnn.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.build())
.layer(1, new GlobalPoolingLayer.Builder().poolingType(pt).build())
.layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
@ -84,20 +85,9 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
mln.init();
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength);
for (int i = 0; i < miniBatchSize; i++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength; k++) {
input.putScalar(new int[] {i, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, miniBatchSize, nIn, timeSeriesLength).subi(0.5);
INDArray labels = Nd4j.zeros(miniBatchSize, nOut);
for (int i = 0; i < miniBatchSize; i++) {
int idx = r.nextInt(nOut);
labels.putScalar(i, idx, 1.0);
}
INDArray labels = TestUtils.randomOneHot(miniBatchSize, nOut).castTo(DataType.DOUBLE);
if (PRINT_RESULTS) {
System.out.println("testLSTMGlobalPoolingBasicMultiLayer() - " + pt + ", minibatch = "
@ -175,12 +165,12 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
@Test
public void testLSTMWithMasking() {
//Basic test of GravesLSTM layer
//Basic test of LSTM layer
Nd4j.getRandom().setSeed(12345L);
int timeSeriesLength = 10;
int nIn = 5;
int layerSize = 4;
int timeSeriesLength = 5;
int nIn = 4;
int layerSize = 3;
int nOut = 2;
int miniBatchSize = 3;
@ -193,7 +183,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1.0)).seed(12345L).list()
.layer(0, new GravesLSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.layer(0, new LSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.build())
.layer(1, new GlobalPoolingLayer.Builder().poolingType(pt).build())
.layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
@ -204,14 +194,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
mln.init();
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength);
for (int i = 0; i < miniBatchSize; i++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength; k++) {
input.putScalar(new int[] {i, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, miniBatchSize, nIn, timeSeriesLength).subi(0.5);
INDArray featuresMask = Nd4j.create(miniBatchSize, timeSeriesLength);
for (int i = 0; i < miniBatchSize; i++) {
@ -221,12 +204,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest {
}
}
INDArray labels = Nd4j.zeros(miniBatchSize, nOut);
for (int i = 0; i < miniBatchSize; i++) {
int idx = r.nextInt(nOut);
labels.putScalar(i, idx, 1.0);
}
INDArray labels = TestUtils.randomOneHot(miniBatchSize, nOut);
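Many hunks in this commit replace hand-rolled putScalar loops with Nd4j.rand plus the TestUtils.randomOneHot and randomOneHotTimeSeries helpers, as in the labels line above. A rough sketch of what such a time-series helper could look like follows; this is a hypothetical illustration, and the repository's actual TestUtils may differ in RNG handling and data types:

    // Hypothetical equivalent of the label-generation loops removed in this commit:
    // one randomly chosen class per (example, time step), one-hot encoded in a rank-3 array.
    public static INDArray randomOneHotTimeSeries(int minibatch, int nOut, int tsLength) {
        java.util.Random r = new java.util.Random(12345);
        INDArray labels = Nd4j.zeros(DataType.DOUBLE, minibatch, nOut, tsLength);
        for (int i = 0; i < minibatch; i++) {
            for (int j = 0; j < tsLength; j++) {
                labels.putScalar(new int[] {i, r.nextInt(nOut), j}, 1.0);
            }
        }
        return labels;
    }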
mln.setLayerMaskArrays(featuresMask, null);
if (PRINT_RESULTS) {

View File

@ -32,6 +32,7 @@ import org.deeplearning4j.nn.conf.graph.rnn.LastTimeStepVertex;
import org.deeplearning4j.nn.conf.graph.rnn.ReverseTimeSeriesVertex;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.conf.preprocessor.CnnToFeedForwardPreProcessor;
import org.deeplearning4j.nn.conf.preprocessor.FeedForwardToRnnPreProcessor;
import org.deeplearning4j.nn.conf.preprocessor.RnnToFeedForwardPreProcessor;
@ -334,7 +335,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
}
@Test
public void testLSTMWithMerging() {
public void testRNNWithMerging() {
Nd4j.getRandom().setSeed(12345);
ComputationGraphConfiguration conf =
@ -345,23 +346,23 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
.updater(new NoOp()).graphBuilder().addInputs("input")
.setOutputs("out")
.addLayer("lstm1",
new GravesLSTM.Builder().nIn(3).nOut(4)
new SimpleRnn.Builder().nIn(3).nOut(3)
.activation(Activation.TANH).build(),
"input")
.addLayer("lstm2",
new GravesLSTM.Builder().nIn(4).nOut(4)
new SimpleRnn.Builder().nIn(3).nOut(3)
.activation(Activation.TANH).build(),
"lstm1")
.addLayer("dense1",
new DenseLayer.Builder().nIn(4).nOut(4)
new DenseLayer.Builder().nIn(3).nOut(3)
.activation(Activation.SIGMOID).build(),
"lstm1")
.addLayer("lstm3",
new GravesLSTM.Builder().nIn(4).nOut(4)
new SimpleRnn.Builder().nIn(3).nOut(3)
.activation(Activation.TANH).build(),
"dense1")
.addVertex("merge", new MergeVertex(), "lstm2", "lstm3")
.addLayer("out", new RnnOutputLayer.Builder().nIn(8).nOut(3)
.addLayer("out", new RnnOutputLayer.Builder().nIn(6).nOut(3)
.activation(Activation.SOFTMAX)
.lossFunction(LossFunctions.LossFunction.MCXENT).build(),
"merge")
@ -373,13 +374,8 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
graph.init();
Random r = new Random(12345);
INDArray input = Nd4j.rand(new int[] {3, 3, 5});
INDArray labels = Nd4j.zeros(3, 3, 5);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 5; j++) {
labels.putScalar(new int[] {i, r.nextInt(3), j}, 1.0);
}
}
INDArray input = Nd4j.rand(new int[] {2, 3, 4});
INDArray labels = TestUtils.randomOneHotTimeSeries(2, 3, 4);
if (PRINT_RESULTS) {
System.out.println("testLSTMWithMerging()");
@ -401,13 +397,12 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
Nd4j.getRandom().setSeed(1234);
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(1234)
.dataType(DataType.DOUBLE)
.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
.dist(new NormalDistribution(0, 1))
.weightInit(new NormalDistribution(0, 1))
.updater(new NoOp()).graphBuilder().addInputs("input").setOutputs("out")
.addLayer("lstm1", new GravesLSTM.Builder().nIn(3).nOut(8).activation(Activation.TANH).build(),
.addLayer("lstm1", new LSTM.Builder().nIn(3).nOut(6).activation(Activation.TANH).build(),
"input")
.addVertex("subset", new SubsetVertex(0, 3), "lstm1")
.addLayer("out", new RnnOutputLayer.Builder().nIn(4).nOut(3).activation(Activation.SOFTMAX)
.addVertex("subset", new SubsetVertex(0, 2), "lstm1")
.addLayer("out", new RnnOutputLayer.Builder().nIn(3).nOut(2).activation(Activation.SOFTMAX)
.lossFunction(LossFunctions.LossFunction.MCXENT).build(), "subset")
.build();
@ -415,13 +410,8 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
graph.init();
Random r = new Random(12345);
INDArray input = Nd4j.rand(new int[] {3, 3, 5});
INDArray labels = Nd4j.zeros(3, 3, 5);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 5; j++) {
labels.putScalar(new int[] {i, r.nextInt(3), j}, 1.0);
}
}
INDArray input = Nd4j.rand(new int[] {2, 3, 4});
INDArray labels = TestUtils.randomOneHotTimeSeries(2, 2, 4);
if (PRINT_RESULTS) {
System.out.println("testLSTMWithSubset()");
@ -447,10 +437,10 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
.dist(new NormalDistribution(0, 1))
.updater(new NoOp()).graphBuilder().addInputs("input").setOutputs("out")
.addLayer("lstm1", new GravesLSTM.Builder().nIn(3).nOut(4).activation(Activation.TANH).build(),
.addLayer("lstm1", new LSTM.Builder().nIn(3).nOut(4).activation(Activation.TANH).build(),
"input")
.addVertex("lastTS", new LastTimeStepVertex("input"), "lstm1")
.addLayer("out", new OutputLayer.Builder().nIn(4).nOut(3).activation(Activation.SOFTMAX)
.addLayer("out", new OutputLayer.Builder().nIn(4).nOut(2).activation(Activation.SOFTMAX)
.lossFunction(LossFunctions.LossFunction.MCXENT).build(), "lastTS")
.build();
@ -458,11 +448,8 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
graph.init();
Random r = new Random(12345);
INDArray input = Nd4j.rand(new int[] {3, 3, 5});
INDArray labels = Nd4j.zeros(3, 3); //Here: labels are 2d (due to LastTimeStepVertex)
for (int i = 0; i < 3; i++) {
labels.putScalar(new int[] {i, r.nextInt(3)}, 1.0);
}
INDArray input = Nd4j.rand(new int[] {2, 3, 4});
INDArray labels = TestUtils.randomOneHot(2, 2); //Here: labels are 2d (due to LastTimeStepVertex)
if (PRINT_RESULTS) {
System.out.println("testLSTMWithLastTimeStepVertex()");
@ -503,16 +490,16 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
.updater(new NoOp()).graphBuilder()
.addInputs("input1", "input2").setOutputs("out")
.addLayer("lstm1",
new GravesLSTM.Builder().nIn(3).nOut(4)
new LSTM.Builder().nIn(3).nOut(3)
.activation(Activation.TANH).build(),
"input1")
.addLayer("lstm2",
new GravesLSTM.Builder().nIn(4).nOut(5)
new LSTM.Builder().nIn(2).nOut(4)
.activation(Activation.SOFTSIGN).build(),
"input2")
.addVertex("lastTS", new LastTimeStepVertex("input2"), "lstm2")
.addVertex("duplicate", new DuplicateToTimeSeriesVertex("input2"), "lastTS")
.addLayer("out", new RnnOutputLayer.Builder().nIn(5 + 4).nOut(3)
.addLayer("out", new RnnOutputLayer.Builder().nIn(3+4).nOut(2)
.activation(Activation.SOFTMAX)
.lossFunction(LossFunctions.LossFunction.MCXENT).build(),
"lstm1", "duplicate")
@ -522,14 +509,9 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
graph.init();
Random r = new Random(12345);
INDArray input1 = Nd4j.rand(new int[] {3, 3, 5});
INDArray input2 = Nd4j.rand(new int[] {3, 4, 5});
INDArray labels = Nd4j.zeros(3, 3, 5);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 5; j++) {
labels.putScalar(new int[] {i, r.nextInt(3), j}, 1.0);
}
}
INDArray input1 = Nd4j.rand(new int[] {2, 3, 4});
INDArray input2 = Nd4j.rand(new int[] {2, 2, 4});
INDArray labels = TestUtils.randomOneHotTimeSeries(2, 2, 4);
if (PRINT_RESULTS) {
System.out.println("testLSTMWithDuplicateToTimeSeries()");
@ -558,16 +540,16 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
.updater(new NoOp()).graphBuilder()
.addInputs("input").setOutputs("out")
.addLayer("lstm_a",
new GravesLSTM.Builder().nIn(3).nOut(4)
new LSTM.Builder().nIn(2).nOut(3)
.activation(Activation.TANH).build(),
"input")
.addVertex("input_rev", new ReverseTimeSeriesVertex("input"), "input")
.addLayer("lstm_b",
new GravesLSTM.Builder().nIn(3).nOut(4)
new LSTM.Builder().nIn(2).nOut(3)
.activation(Activation.TANH).build(),
"input_rev")
.addVertex("lstm_b_rev", new ReverseTimeSeriesVertex("input"), "lstm_b")
.addLayer("out", new RnnOutputLayer.Builder().nIn(4 + 4).nOut(3)
.addLayer("out", new RnnOutputLayer.Builder().nIn(3 + 3).nOut(2)
.activation(Activation.SOFTMAX)
.lossFunction(LossFunctions.LossFunction.MCXENT).build(),
"lstm_a", "lstm_b_rev")
@ -577,13 +559,8 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
graph.init();
Random r = new Random(12345);
INDArray input = Nd4j.rand(new int[] {3, 3, 5});
INDArray labels = Nd4j.zeros(3, 3, 5);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 5; j++) {
labels.putScalar(new int[] {i, r.nextInt(3), j}, 1.0);
}
}
INDArray input = Nd4j.rand(new int[] {2, 2, 4});
INDArray labels = TestUtils.randomOneHotTimeSeries(2, 2, 4);
if (PRINT_RESULTS) {
System.out.println("testLSTMWithReverseTimeSeriesVertex()");
@ -1171,10 +1148,10 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
.dist(new NormalDistribution(0, 1))
.activation(Activation.TANH).updater(new NoOp()).graphBuilder()
.addInputs("in1", "in2")
.addLayer("d0", new GravesLSTM.Builder().nIn(layerSizes).nOut(layerSizes).build(), "in1")
.addLayer("d1", new GravesLSTM.Builder().nIn(layerSizes).nOut(layerSizes).build(), "in2")
.addLayer("d0", new SimpleRnn.Builder().nIn(layerSizes).nOut(layerSizes).build(), "in1")
.addLayer("d1", new SimpleRnn.Builder().nIn(layerSizes).nOut(layerSizes).build(), "in2")
.addVertex("stack", new StackVertex(), "d0", "d1")
.addLayer("d2", new GravesLSTM.Builder().nIn(layerSizes).nOut(layerSizes).build(), "stack")
.addLayer("d2", new SimpleRnn.Builder().nIn(layerSizes).nOut(layerSizes).build(), "stack")
.addVertex("u1", new UnstackVertex(0, 2), "d2").addVertex("u2", new UnstackVertex(1, 2), "d2")
.addLayer("p1", new GlobalPoolingLayer.Builder(PoolingType.AVG).build(), "u1")
.addLayer("p2", new GlobalPoolingLayer.Builder(PoolingType.AVG).build(), "u2")
@ -1193,7 +1170,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest {
INDArray newParams = Nd4j.rand(new long[]{1, nParams});
graph.setParams(newParams);
int[] mbSizes = new int[] {1, 3, 10};
int[] mbSizes = new int[] {1, 2, 3};
for (int minibatch : mbSizes) {
INDArray in1 = Nd4j.rand(new int[] {minibatch, layerSizes, 4});

View File

@ -25,6 +25,7 @@ import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
@ -85,7 +86,7 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
mask[3] = new boolean[] {false, false, true, false, true}; //time series classification w/ variable length TS
mask[4] = new boolean[] {true, true, true, false, true}; //variable length TS
int nIn = 4;
int nIn = 3;
int layerSize = 3;
GradientCheckSimpleScenario[] scenarios = new GradientCheckSimpleScenario[] {
@ -94,23 +95,14 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
new GradientCheckSimpleScenario(LossMixtureDensity.builder().gaussians(2).labelWidth(3).build(),
Activation.TANH, 10, 3),
new GradientCheckSimpleScenario(LossMixtureDensity.builder().gaussians(2).labelWidth(4).build(),
Activation.IDENTITY, 12, 4),
new GradientCheckSimpleScenario(LossFunctions.LossFunction.L2.getILossFunction(),
Activation.SOFTMAX, 2, 2)};
Activation.IDENTITY, 12, 4)};
for (GradientCheckSimpleScenario s : scenarios) {
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(1, nIn, timeSeriesLength);
for (int m = 0; m < 1; m++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength; k++) {
input.putScalar(new int[] {m, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, 1, nIn, timeSeriesLength).subi(0.5);
INDArray labels = Nd4j.zeros(1, s.labelWidth, timeSeriesLength);
INDArray labels = Nd4j.zeros(DataType.DOUBLE, 1, s.labelWidth, timeSeriesLength);
for (int m = 0; m < 1; m++) {
for (int j = 0; j < timeSeriesLength; j++) {
int idx = r.nextInt(s.labelWidth);
@ -127,15 +119,14 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
}
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345L)
.dataType(DataType.DOUBLE)
.list()
.layer(0, new GravesLSTM.Builder().nIn(nIn).nOut(layerSize)
.dist(new NormalDistribution(0, 1))
.updater(new NoOp()).build())
.layer(1, new RnnOutputLayer.Builder(s.lf).activation(s.act).nIn(layerSize).nOut(s.nOut)
.dist(new NormalDistribution(0, 1))
.updater(new NoOp()).build())
.build();
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.list()
.layer(0, new SimpleRnn.Builder().nIn(nIn).nOut(layerSize)
.weightInit(new NormalDistribution(0, 1)).build())
.layer(1, new RnnOutputLayer.Builder(s.lf).activation(s.act).nIn(layerSize).nOut(s.nOut)
.weightInit(new NormalDistribution(0, 1)).build())
.build();
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
@ -156,15 +147,14 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
int timeSeriesLength = 5;
int nIn = 5;
int layerSize = 4;
int layerSize = 3;
int nOut = 3;
int miniBatchSize = 3;
int miniBatchSize = 2;
INDArray[] masks = new INDArray[] {null,
Nd4j.create(new double[][] {{1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}),
Nd4j.create(new double[][] {{1, 1, 1, 1, 1}, {1, 1, 1, 1, 0}, {1, 1, 1, 0, 0}}),
Nd4j.create(new double[][] {{1, 1, 1, 1, 1}, {0, 1, 1, 1, 1}, {0, 0, 1, 1, 1}})};
INDArray[] masks = new INDArray[] {
Nd4j.create(new double[][] {{1, 1, 1, 1, 1}, {1, 1, 1, 0, 0}}),
Nd4j.create(new double[][] {{1, 1, 1, 1, 1}, {0, 1, 1, 1, 1}})};
int testNum = 0;
for (INDArray mask : masks) {
@ -201,7 +191,7 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, mask, mask);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, mask, mask, true, 16);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
@ -295,9 +285,9 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
//For RNNs: per-output masking uses 3d masks (same shape as output/labels), as compared to the standard
// 2d masks (used for per *example* masking)
int nIn = 4;
int layerSize = 4;
int nOut = 4;
int nIn = 3;
int layerSize = 3;
int nOut = 2;
//1 example, TS length 3
INDArray mask1 = Nd4j.create(new double[] {1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0}, new int[] {1, nOut, 3}, 'f');
@ -358,7 +348,7 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 1)).seed(12345)
.list()
.layer(0, new GravesLSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.layer(0, new SimpleRnn.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH)
.build())
.layer(1, new RnnOutputLayer.Builder().nIn(layerSize).nOut(nOut).lossFunction(lf)
.activation(a).build())
@ -390,7 +380,7 @@ public class GradientCheckTestsMasking extends BaseDL4JTest {
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 2)).seed(12345)
.graphBuilder().addInputs("in")
.addLayer("0", new GravesLSTM.Builder().nIn(nIn).nOut(layerSize)
.addLayer("0", new SimpleRnn.Builder().nIn(nIn).nOut(layerSize)
.activation(Activation.TANH).build(), "in")
.addLayer("1", new RnnOutputLayer.Builder().nIn(layerSize).nOut(nOut).lossFunction(lf)
.activation(a).build(), "0")

View File

@ -139,11 +139,11 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
@Test
public void testGradientLSTMFull() {
int timeSeriesLength = 8;
int nIn = 7;
int layerSize = 9;
int nOut = 4;
int miniBatchSize = 6;
int timeSeriesLength = 4;
int nIn = 3;
int layerSize = 4;
int nOut = 2;
int miniBatchSize = 2;
boolean[] gravesLSTM = new boolean[] {true, false};
@ -162,13 +162,13 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
//use l2vals[i] with l1vals[i]
double[] l2vals = {0.4, 0.0, 0.4, 0.4};
double[] l1vals = {0.0, 0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.0, 0.6, 0.0};
Activation[] activFns = {Activation.TANH, Activation.SOFTSIGN, Activation.TANH, Activation.TANH};
LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE, LossFunction.MSE, LossFunction.MCXENT};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH, Activation.IDENTITY, Activation.SOFTMAX};
double[] l2vals = {0.4, 0.0};
double[] l1vals = {0.0, 0.5};
double[] biasL2 = {0.3, 0.0};
double[] biasL1 = {0.0, 0.6};
Activation[] activFns = {Activation.TANH, Activation.SOFTSIGN};
LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH};
for (int i = 0; i < l2vals.length; i++) {
@ -218,7 +218,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 128);
assertTrue(testName, gradOK);
TestUtils.testModelSerialization(mln);
@ -233,9 +233,9 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
int[] timeSeriesLength = {1, 5, 1};
int[] miniBatchSize = {7, 1, 1};
int nIn = 7;
int layerSize = 9;
int nOut = 4;
int nIn = 3;
int layerSize = 4;
int nOut = 2;
boolean[] gravesLSTM = new boolean[] {true, false};
@ -244,22 +244,9 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
for (int i = 0; i < timeSeriesLength.length; i++) {
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(miniBatchSize[i], nIn, timeSeriesLength[i]);
for (int m = 0; m < miniBatchSize[i]; m++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength[i]; k++) {
input.putScalar(new int[] {m, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, miniBatchSize[i], nIn, timeSeriesLength[i]);
INDArray labels = Nd4j.zeros(miniBatchSize[i], nOut, timeSeriesLength[i]);
for (int m = 0; m < miniBatchSize[i]; m++) {
for (int j = 0; j < timeSeriesLength[i]; j++) {
int idx = r.nextInt(nOut);
labels.putScalar(new int[] {m, idx, j}, 1.0f);
}
}
INDArray labels = TestUtils.randomOneHotTimeSeries(miniBatchSize[i], nOut, timeSeriesLength[i]);
Layer layer;
if (graves) {
@ -296,91 +283,75 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
int timeSeriesLength = 4;
int timeSeriesLength = 3;
int nIn = 2;
int layerSize = 2;
int nOut = 2;
int miniBatchSize = 3;
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength);
for (int i = 0; i < miniBatchSize; i++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength; k++) {
input.putScalar(new int[] {i, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray labels = Nd4j.zeros(miniBatchSize, nOut, timeSeriesLength);
for (int i = 0; i < miniBatchSize; i++) {
for (int j = 0; j < timeSeriesLength; j++) {
int idx = r.nextInt(nOut);
labels.putScalar(new int[] {i, idx, j}, 1.0f);
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, miniBatchSize, nIn, timeSeriesLength).subi(0.5);
INDArray labels = TestUtils.randomOneHotTimeSeries(miniBatchSize, nOut, timeSeriesLength);
//use l2vals[i] with l1vals[i]
double[] l2vals = {0.4, 0.0, 0.4, 0.4};
double[] l1vals = {0.0, 0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.0, 0.6, 0.0};
double[] l2vals = {0.4, 0.0};
double[] l1vals = {0.5, 0.0};
double[] biasL2 = {0.0, 0.2};
double[] biasL1 = {0.0, 0.6};
for (Activation afn : activFns) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int k = 0; k < l2vals.length; k++) {
LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[k];
double l1 = l1vals[k];
for (int i = 0; i < lossFunctions.length; i++) {
for (int k = 0; k < l2vals.length; k++) {
Activation afn = activFns[i];
LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[k];
double l1 = l1vals[k];
NeuralNetConfiguration.Builder conf =
new NeuralNetConfiguration.Builder();
if (l1 > 0.0)
conf.l1(l1);
if (l2 > 0.0)
conf.l2(l2);
if (biasL2[k] > 0)
conf.l2Bias(biasL2[k]);
if (biasL1[k] > 0)
conf.l1Bias(biasL1[k]);
NeuralNetConfiguration.Builder conf =
new NeuralNetConfiguration.Builder();
if (l1 > 0.0)
conf.l1(l1);
if (l2 > 0.0)
conf.l2(l2);
if (biasL2[k] > 0)
conf.l2Bias(biasL2[k]);
if (biasL1[k] > 0)
conf.l1Bias(biasL1[k]);
MultiLayerConfiguration mlc = conf.seed(12345L)
.dataType(DataType.DOUBLE)
.list().layer(0,
new GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize)
.dist(new NormalDistribution(0, 1))
.activation(afn).updater(
Updater.NONE)
.build())
.layer(1, new RnnOutputLayer.Builder(lf).activation(outputActivation).nIn(layerSize)
.nOut(nOut)
.dist(new NormalDistribution(0, 1)).updater(new NoOp()).build())
.build();
MultiLayerConfiguration mlc = conf.seed(12345L)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.list().layer(0,
new GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize)
.weightInit(new NormalDistribution(0, 1))
.activation(afn)
.build())
.layer(1, new RnnOutputLayer.Builder(lf).activation(outputActivation).nIn(layerSize)
.nOut(nOut)
.dist(new NormalDistribution(0, 1)).updater(new NoOp()).build())
.build();
MultiLayerNetwork mln = new MultiLayerNetwork(mlc);
MultiLayerNetwork mln = new MultiLayerNetwork(mlc);
mln.init();
mln.init();
if (PRINT_RESULTS) {
System.out.println("testGradientGravesBidirectionalLSTMFull() - activationFn=" + afn
+ ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2
+ ", l1=" + l1);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
String msg = "testGradientGravesLSTMFull() - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1;
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(mln);
if (PRINT_RESULTS) {
System.out.println("testGradientGravesBidirectionalLSTMFull() - activationFn=" + afn
+ ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2
+ ", l1=" + l1);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
String msg = "testGradientGravesLSTMFull() - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1;
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(mln);
}
}
}
@ -391,21 +362,14 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
int[] timeSeriesLength = {1, 5, 1};
int[] miniBatchSize = {7, 1, 1};
int nIn = 7;
int layerSize = 9;
int nOut = 4;
int nIn = 3;
int layerSize = 4;
int nOut = 2;
for (int i = 0; i < timeSeriesLength.length; i++) {
Random r = new Random(12345L);
INDArray input = Nd4j.zeros(miniBatchSize[i], nIn, timeSeriesLength[i]);
for (int m = 0; m < miniBatchSize[i]; m++) {
for (int j = 0; j < nIn; j++) {
for (int k = 0; k < timeSeriesLength[i]; k++) {
input.putScalar(new int[] {m, j, k}, r.nextDouble() - 0.5);
}
}
}
INDArray input = Nd4j.rand(DataType.DOUBLE, miniBatchSize[i], nIn, timeSeriesLength[i]).subi(0.5);
INDArray labels = Nd4j.zeros(miniBatchSize[i], nOut, timeSeriesLength[i]);
for (int m = 0; m < miniBatchSize[i]; m++) {
@ -431,7 +395,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
mln.init();
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 128);
String msg = "testGradientGravesLSTMEdgeCases() - timeSeriesLength=" + timeSeriesLength[i]
+ ", miniBatchSize=" + miniBatchSize[i];
@ -445,11 +409,11 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
//Test gradients with CNN -> FF -> LSTM -> RnnOutputLayer
//time series input/output (i.e., video classification or similar)
int nChannelsIn = 3;
int inputSize = 10 * 10 * nChannelsIn; //10px x 10px x 3 channels
int miniBatchSize = 4;
int timeSeriesLength = 10;
int nClasses = 3;
int nChannelsIn = 2;
int inputSize = 6 * 6 * nChannelsIn; //6px x 6px x 2 channels
int miniBatchSize = 2;
int timeSeriesLength = 4;
int nClasses = 2;
//Generate
Nd4j.getRandom().setSeed(12345);
@ -467,18 +431,18 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new NoOp()).seed(12345)
.dataType(DataType.DOUBLE)
.dist(new UniformDistribution(-2, 2)).list()
.layer(0, new ConvolutionLayer.Builder(5, 5).nIn(3).nOut(5).stride(1, 1)
.layer(0, new ConvolutionLayer.Builder(3, 3).nIn(2).nOut(3).stride(1, 1)
.activation(Activation.TANH).build()) //Out: (10-5)/1+1 = 6 -> 6x6x5
.layer(1, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX).kernelSize(2, 2)
.stride(1, 1).build()) //Out: (6-2)/1+1 = 5 -> 5x5x5
.layer(2, new DenseLayer.Builder().nIn(5 * 5 * 5).nOut(4).activation(Activation.TANH).build())
.layer(2, new DenseLayer.Builder().nIn(27).nOut(4).activation(Activation.TANH).build())
.layer(3, new GravesLSTM.Builder().nIn(4).nOut(3).activation(Activation.TANH).build())
.layer(4, new RnnOutputLayer.Builder().lossFunction(LossFunction.MCXENT).nIn(3).nOut(nClasses)
.activation(Activation.SOFTMAX).build())
.setInputType(InputType.convolutional(10, 10, 3)).build();
.setInputType(InputType.convolutional(6, 6, 2)).build();
//Here: ConvolutionLayerSetup in config builder doesn't know that we are expecting time series input, not standard FF input -> override it here
conf.getInputPreProcessors().put(0, new RnnToCnnPreProcessor(10, 10, 3));
conf.getInputPreProcessors().put(0, new RnnToCnnPreProcessor(6, 6, 2));
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
@ -489,7 +453,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 32);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}

View File

@ -68,6 +68,8 @@ public class RnnGradientChecks extends BaseDL4JTest {
for (boolean inputMask : new boolean[]{false, true}) {
for (boolean simple : new boolean[]{false, true}) {
for(boolean hasLayerNorm: new boolean[]{true, false}) {
if(!simple && hasLayerNorm)
continue;
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut, tsLength);
@ -93,6 +95,11 @@ public class RnnGradientChecks extends BaseDL4JTest {
}
for (Bidirectional.Mode m : modes) {
//Skip 3 of 4 test cases: from 64 down to 16, which should still give good coverage
//Note RNG seed - deterministic run-to-run
if(r.nextInt(4) != 0)
continue;
String name = "mb=" + mb + ", maskType=" + maskType + ", mode=" + m + ", hasLayerNorm=" + hasLayerNorm + ", rnnType="
+ (simple ? "SimpleRnn" : "LSTM");
@ -144,6 +151,9 @@ public class RnnGradientChecks extends BaseDL4JTest {
for (boolean inputMask : new boolean[]{false, true}) {
for (boolean hasLayerNorm : new boolean[]{true, false}) {
for (int l = 0; l < l1s.length; l++) {
//Only run 1 of 5 (on average - note RNG seed for deterministic testing) - 25 of 128 test cases (to minimize test time)
if(r.nextInt(5) != 0)
continue;
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
INDArray labels = Nd4j.create(mb, nOut, tsLength);
@ -217,6 +227,8 @@ public class RnnGradientChecks extends BaseDL4JTest {
for (boolean inputMask : new boolean[]{false, true}) {
for (boolean simple : new boolean[]{false, true}) {
for (boolean hasLayerNorm : new boolean[]{true, false}) {
if(!simple && hasLayerNorm)
continue;
INDArray in = Nd4j.rand(new int[]{mb, nIn, tsLength});
@ -265,7 +277,7 @@ public class RnnGradientChecks extends BaseDL4JTest {
net.init();
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, in, labels, inMask, null, true, 16);
assertTrue(name, gradOK);
TestUtils.testModelSerialization(net);
}
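The RnnGradientChecks changes above cut runtime in a different way: rather than shrinking layer sizes, they skip most of the Cartesian product of test parameters with a seeded RNG, so the same subset of combinations runs on every build. A minimal sketch of that pattern, assuming a seeded java.util.Random named r as in the surrounding tests, illustrative loop values, and a hypothetical runGradientCheck helper standing in for the real test body:

    Random r = new Random(12345);                 // fixed seed, so the selected subset is deterministic run-to-run
    for (int mb : new int[]{1, 3}) {              // illustrative values
        for (boolean inputMask : new boolean[]{false, true}) {
            for (Bidirectional.Mode m : modes) {  // modes assumed defined earlier in the test
                if (r.nextInt(4) != 0)
                    continue;                     // keep roughly 1 in 4 combinations, skip the rest
                runGradientCheck(mb, inputMask, m);  // hypothetical helper for the test body
            }
        }
    }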

View File

@ -26,6 +26,7 @@ import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.conf.layers.misc.FrozenLayerWithBackprop;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.conf.layers.util.MaskLayer;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
@ -60,9 +61,9 @@ public class UtilLayerGradientChecks extends BaseDL4JTest {
@Test
public void testMaskLayer() {
Nd4j.getRandom().setSeed(12345);
int tsLength = 5;
int tsLength = 3;
for(int minibatch : new int[]{1,8}) {
for(int minibatch : new int[]{1,3}) {
for (int inputRank : new int[]{2, 3, 4}) {
for (boolean inputMask : new boolean[]{false, true}) {
String maskType = (inputMask ? "inputMask" : "none");
@ -74,7 +75,7 @@ public class UtilLayerGradientChecks extends BaseDL4JTest {
if(minibatch == 1){
inMask = Nd4j.ones(1,1);
} else {
inMask = Nd4j.create(minibatch, 1);
inMask = Nd4j.create(DataType.DOUBLE, minibatch, 1);
Nd4j.getExecutioner().exec(new BernoulliDistribution(inMask, 0.5));
int count = inMask.sumNumber().intValue();
assertTrue(count >= 0 && count <= minibatch); //Sanity check on RNG seed
@ -83,16 +84,16 @@ public class UtilLayerGradientChecks extends BaseDL4JTest {
case 4:
//Per-example mask (broadcast along all channels/x/y)
if(minibatch == 1){
inMask = Nd4j.ones(1,1, 1, 1);
inMask = Nd4j.ones(DataType.DOUBLE, 1,1, 1, 1);
} else {
inMask = Nd4j.create(minibatch, 1, 1, 1);
inMask = Nd4j.create(DataType.DOUBLE, minibatch, 1, 1, 1);
Nd4j.getExecutioner().exec(new BernoulliDistribution(inMask, 0.5));
int count = inMask.sumNumber().intValue();
assertTrue(count >= 0 && count <= minibatch); //Sanity check on RNG seed
}
break;
case 3:
inMask = Nd4j.ones(minibatch, tsLength);
inMask = Nd4j.ones(DataType.DOUBLE, minibatch, tsLength);
for( int i=0; i<minibatch; i++ ){
for( int j=i+1; j<tsLength; j++ ){
inMask.putScalar(i,j,0.0);
@ -108,11 +109,11 @@ public class UtilLayerGradientChecks extends BaseDL4JTest {
int[] labelShape;
switch (inputRank){
case 2:
inShape = new int[]{minibatch, 5};
inShape = new int[]{minibatch, 3};
labelShape = inShape;
break;
case 3:
inShape = new int[]{minibatch, 5, tsLength};
inShape = new int[]{minibatch, 3, tsLength};
labelShape = inShape;
break;
case 4:
@ -134,18 +135,18 @@ public class UtilLayerGradientChecks extends BaseDL4JTest {
InputType it;
switch (inputRank){
case 2:
l1 = new DenseLayer.Builder().nOut(5).build();
l2 = new DenseLayer.Builder().nOut(5).build();
l3 = new OutputLayer.Builder().nOut(5).lossFunction(LossFunctions.LossFunction.MSE)
l1 = new DenseLayer.Builder().nOut(3).build();
l2 = new DenseLayer.Builder().nOut(3).build();
l3 = new OutputLayer.Builder().nOut(3).lossFunction(LossFunctions.LossFunction.MSE)
.activation(Activation.TANH).build();
it = InputType.feedForward(5);
it = InputType.feedForward(3);
break;
case 3:
l1 = new LSTM.Builder().nIn(5).nOut(5).activation(Activation.TANH).build();
l2 = new LSTM.Builder().nIn(5).nOut(5).activation(Activation.TANH).build();
l3 = new RnnOutputLayer.Builder().nIn(5).nOut(5).lossFunction(LossFunctions.LossFunction.SQUARED_LOSS)
l1 = new SimpleRnn.Builder().nIn(3).nOut(3).activation(Activation.TANH).build();
l2 = new SimpleRnn.Builder().nIn(3).nOut(3).activation(Activation.TANH).build();
l3 = new RnnOutputLayer.Builder().nIn(3).nOut(3).lossFunction(LossFunctions.LossFunction.SQUARED_LOSS)
.activation(Activation.IDENTITY).build();
it = InputType.recurrent(5);
it = InputType.recurrent(3);
break;
case 4:
l1 = new ConvolutionLayer.Builder().nOut(5).convolutionMode(ConvolutionMode.Truncate)

View File

@ -138,28 +138,23 @@ public class VaeGradientCheckTests extends BaseDL4JTest {
@Test
public void testVaePretrain() {
Nd4j.getRandom().setSeed(12345);
Activation[] activFns = {Activation.IDENTITY, Activation.TANH, Activation.IDENTITY, Activation.TANH};
LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MCXENT, LossFunction.MSE, LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.SOFTMAX, Activation.TANH, Activation.TANH};
Activation[] pzxAfns = {Activation.IDENTITY, Activation.TANH, Activation.IDENTITY, Activation.TANH};
Activation[] pxzAfns = {Activation.TANH, Activation.IDENTITY, Activation.TANH, Activation.TANH};
Activation[] activFns = {Activation.IDENTITY, Activation.TANH, Activation.SOFTSIGN};
Activation[] pzxAfns = {Activation.IDENTITY, Activation.IDENTITY, Activation.TANH};
Activation[] pxzAfns = {Activation.TANH, Activation.TANH, Activation.IDENTITY};
//use l2vals[i] with l1vals[i]
double[] l2vals = {0.4, 0.0, 0.4, 0.4};
double[] l1vals = {0.0, 0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.0, 0.6, 0.0};
double[] l2vals = {0.0, 0.4, 0.4};
double[] l1vals = {0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.6, 0.0};
int[][] encoderLayerSizes = new int[][] {{5}, {5}, {5, 6}, {5, 6}};
int[][] decoderLayerSizes = new int[][] {{6}, {7, 8}, {6}, {7, 8}};
int[][] encoderLayerSizes = new int[][] {{5}, {3, 4}, {3, 4}};
int[][] decoderLayerSizes = new int[][] {{4}, {2}, {4, 3}};
int[] minibatches = new int[]{1,5,4,3};
int[] minibatches = new int[]{1,3,2,3};
Nd4j.getRandom().setSeed(12345);
for( int i=0; i<activFns.length; i++ ){
LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[i];
double l1 = l1vals[i];
int[] encoderSizes = encoderLayerSizes[i];
@ -214,18 +209,18 @@ public class VaeGradientCheckTests extends BaseDL4JTest {
@Test
public void testVaePretrainReconstructionDistributions() {
int inOutSize = 6;
int inOutSize = 3;
ReconstructionDistribution[] reconstructionDistributions =
new ReconstructionDistribution[]{new GaussianReconstructionDistribution(Activation.IDENTITY),
new GaussianReconstructionDistribution(Activation.TANH),
new BernoulliReconstructionDistribution(Activation.SIGMOID),
new CompositeReconstructionDistribution.Builder()
.addDistribution(2,
.addDistribution(1,
new GaussianReconstructionDistribution(
Activation.IDENTITY))
.addDistribution(2, new BernoulliReconstructionDistribution())
.addDistribution(2,
.addDistribution(1, new BernoulliReconstructionDistribution())
.addDistribution(1,
new GaussianReconstructionDistribution(
Activation.TANH))
.build(),
@ -248,12 +243,12 @@ public class VaeGradientCheckTests extends BaseDL4JTest {
break;
case 3: //Composite
data = Nd4j.create(minibatch, inOutSize);
data.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 2)).assign(Nd4j.rand(minibatch, 2));
data.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 1)).assign(Nd4j.rand(minibatch, 1));
Nd4j.getExecutioner()
.exec(new BernoulliDistribution(
data.get(NDArrayIndex.all(), NDArrayIndex.interval(2, 4)), 0.5),
data.get(NDArrayIndex.all(), NDArrayIndex.interval(1, 2)), 0.5),
Nd4j.getRandom());
data.get(NDArrayIndex.all(), NDArrayIndex.interval(4, 6)).assign(Nd4j.rand(minibatch, 2));
data.get(NDArrayIndex.all(), NDArrayIndex.interval(2, 3)).assign(Nd4j.rand(minibatch, 1));
break;
case 4:
case 5:
@ -269,7 +264,7 @@ public class VaeGradientCheckTests extends BaseDL4JTest {
.seed(12345L).dist(new NormalDistribution(0, 1))
.list().layer(0,
new VariationalAutoencoder.Builder().nIn(inOutSize).nOut(3)
.encoderLayerSizes(5).decoderLayerSizes(6)
.encoderLayerSizes(4).decoderLayerSizes(3)
.pzxActivationFunction(Activation.TANH)
.reconstructionDistribution(
reconstructionDistributions[i])
@ -304,17 +299,15 @@ public class VaeGradientCheckTests extends BaseDL4JTest {
int minibatch = 2;
Nd4j.getRandom().setSeed(12345);
for (int numSamples : new int[]{1, 3}) {
// for (int numSamples : new int[]{10}) {
INDArray features = Nd4j.rand(minibatch, 4);
for (int numSamples : new int[]{1, 2}) {
INDArray features = Nd4j.rand(DataType.DOUBLE, minibatch, 4);
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().l2(0.2).l1(0.3)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.seed(12345L).weightInit(WeightInit.XAVIER).list()
.layer(0, new VariationalAutoencoder.Builder().nIn(4).nOut(3).encoderLayerSizes(5, 6)
.decoderLayerSizes(7, 8).pzxActivationFunction(Activation.TANH)
.layer(0, new VariationalAutoencoder.Builder().nIn(4).nOut(3).encoderLayerSizes(2, 3)
.decoderLayerSizes(4, 3).pzxActivationFunction(Activation.TANH)
.reconstructionDistribution(
new GaussianReconstructionDistribution(Activation.TANH))
.numSamples(numSamples).activation(Activation.TANH)

View File

@ -72,9 +72,6 @@ public class YoloGradientCheckTests extends BaseDL4JTest {
@Test
public void testYoloOutputLayer() {
int depthIn = 2;
int[] minibatchSizes = {1, 3};
int[] widths = new int[]{4, 7};
int[] heights = new int[]{4, 5};
int c = 3;
int b = 3;
@ -83,52 +80,51 @@ public class YoloGradientCheckTests extends BaseDL4JTest {
Nd4j.getRandom().setSeed(1234567);
int[] minibatchSizes = {1, 3};
int[] widths = new int[]{4, 7};
int[] heights = new int[]{4, 5};
double[] l1 = new double[]{0.0, 0.3};
double[] l2 = new double[]{0.0, 0.4};
for( int wh = 0; wh<widths.length; wh++ ) {
for( int i = 0; i<widths.length; i++ ) {
int w = widths[wh];
int h = heights[wh];
int w = widths[i];
int h = heights[i];
int mb = minibatchSizes[i];
Nd4j.getRandom().setSeed(12345);
INDArray bbPrior = Nd4j.rand(b, 2).muliRowVector(Nd4j.create(new double[]{w, h})).addi(0.1);
for (int mb : minibatchSizes) {
for (int i = 0; i < l1.length; i++) {
Nd4j.getRandom().setSeed(12345);
Nd4j.getRandom().setSeed(12345);
INDArray input = Nd4j.rand(new int[]{mb, depthIn, h, w});
INDArray labels = yoloLabels(mb, c, h, w);
INDArray input = Nd4j.rand(new int[]{mb, depthIn, h, w});
INDArray labels = yoloLabels(mb, c, h, w);
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(a)
.l1(l1[i]).l2(l2[i])
.convolutionMode(ConvolutionMode.Same)
.list()
.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(1, 1)
.nIn(depthIn).nOut(yoloDepth).build())//output: (5-2+0)/1+1 = 4
.layer(new Yolo2OutputLayer.Builder()
.boundingBoxPriors(bbPrior)
.build())
.build();
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(a)
.l1(l1[i]).l2(l2[i])
.convolutionMode(ConvolutionMode.Same)
.list()
.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(1, 1)
.nIn(depthIn).nOut(yoloDepth).build())//output: (5-2+0)/1+1 = 4
.layer(new Yolo2OutputLayer.Builder()
.boundingBoxPriors(bbPrior)
.build())
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
String msg = "testYoloOutputLayer() - minibatch = " + mb + ", w=" + w + ", h=" + h + ", l1=" + l1[i] + ", l2=" + l2[i];
System.out.println(msg);
String msg = "testYoloOutputLayer() - minibatch = " + mb + ", w=" + w + ", h=" + h + ", l1=" + l1[i] + ", l2=" + l2[i];
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 100);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
@ -233,7 +229,7 @@ public class YoloGradientCheckTests extends BaseDL4JTest {
INDArray l = ds.getLabels();
boolean ok = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l, null, null, true, 64);
assertTrue(ok);
TestUtils.testModelSerialization(net);

View File

@ -446,7 +446,7 @@ public class DTypeTests extends BaseDL4JTest {
.layer(new ActivationLayer(Activation.LEAKYRELU))
.layer(secondLast)
.layer(ol)
.setInputType(InputType.convolutionalFlat(28, 28, 1))
.setInputType(InputType.convolutionalFlat(8, 8, 1))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
@ -457,16 +457,16 @@ public class DTypeTests extends BaseDL4JTest {
assertEquals(msg, networkDtype, net.getFlattenedGradients().dataType());
assertEquals(msg, networkDtype, net.getUpdater(true).getStateViewArray().dataType());
INDArray in = Nd4j.rand(networkDtype, 2, 28 * 28);
INDArray in = Nd4j.rand(networkDtype, 2, 8 * 8);
INDArray label;
if (outputLayer < 3) {
label = TestUtils.randomOneHot(2, 10).castTo(networkDtype);
} else if (outputLayer == 3) {
//CNN loss
label = Nd4j.rand(networkDtype, 2, 3, 28, 28);
label = Nd4j.rand(networkDtype, 2, 3, 8, 8);
} else if (outputLayer == 4) {
//YOLO
label = Nd4j.ones(networkDtype, 2, 6, 28, 28);
label = Nd4j.ones(networkDtype, 2, 6, 8, 8);
} else {
throw new IllegalStateException();
}
@ -550,7 +550,7 @@ public class DTypeTests extends BaseDL4JTest {
.layer(new Upsampling3D.Builder().size(2).build())
.layer(secondLast)
.layer(ol)
.setInputType(InputType.convolutional3D(Convolution3D.DataFormat.NCDHW, 28, 28, 28, 1))
.setInputType(InputType.convolutional3D(Convolution3D.DataFormat.NCDHW, 8, 8, 8, 1))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
@ -561,13 +561,13 @@ public class DTypeTests extends BaseDL4JTest {
assertEquals(msg, networkDtype, net.getFlattenedGradients().dataType());
assertEquals(msg, networkDtype, net.getUpdater(true).getStateViewArray().dataType());
INDArray in = Nd4j.rand(networkDtype, 2, 1, 28, 28, 28);
INDArray in = Nd4j.rand(networkDtype, 2, 1, 8, 8, 8);
INDArray label;
if (outputLayer == 0) {
label = TestUtils.randomOneHot(2, 10).castTo(networkDtype);
} else if (outputLayer == 1) {
//CNN3D loss
label = Nd4j.rand(networkDtype, 2, 3, 28, 28, 28);
label = Nd4j.rand(networkDtype, 2, 3, 8, 8, 8);
} else if (outputLayer == 2) {
label = TestUtils.randomOneHot(2, 10).castTo(networkDtype);
} else {
@ -787,15 +787,15 @@ public class DTypeTests extends BaseDL4JTest {
switch (outputLayer) {
case 0:
ol = new RnnOutputLayer.Builder().nOut(5).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build();
secondLast = new LSTM.Builder().nOut(5).activation(Activation.TANH).build();
secondLast = new SimpleRnn.Builder().nOut(5).activation(Activation.TANH).build();
break;
case 1:
ol = new RnnLossLayer.Builder().activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build();
secondLast = new LSTM.Builder().nOut(5).activation(Activation.TANH).build();
secondLast = new SimpleRnn.Builder().nOut(5).activation(Activation.TANH).build();
break;
case 2:
ol = new OutputLayer.Builder().nOut(5).build();
secondLast = new LastTimeStep(new LSTM.Builder().nOut(5).activation(Activation.TANH).build());
secondLast = new LastTimeStep(new SimpleRnn.Builder().nOut(5).activation(Activation.TANH).build());
break;
default:
throw new RuntimeException();
@ -825,12 +825,12 @@ public class DTypeTests extends BaseDL4JTest {
assertEquals(msg, networkDtype, net.getFlattenedGradients().dataType());
assertEquals(msg, networkDtype, net.getUpdater(true).getStateViewArray().dataType());
INDArray in = Nd4j.rand(networkDtype, 2, 5, 4);
INDArray in = Nd4j.rand(networkDtype, 2, 5, 2);
INDArray label;
if (outputLayer == 2) {
label = TestUtils.randomOneHot(2, 5).castTo(networkDtype);
} else {
label = TestUtils.randomOneHotTimeSeries(2, 5, 4).castTo(networkDtype);
label = TestUtils.randomOneHotTimeSeries(2, 5, 2).castTo(networkDtype);
}
@ -845,7 +845,7 @@ public class DTypeTests extends BaseDL4JTest {
net.setLabels(label);
net.computeGradientAndScore();
net.fit(new DataSet(in, label, Nd4j.ones(networkDtype, 2, 4), outputLayer == 2 ? null : Nd4j.ones(networkDtype, 2, 4)));
net.fit(new DataSet(in, label, Nd4j.ones(networkDtype, 2, 2), outputLayer == 2 ? null : Nd4j.ones(networkDtype, 2, 2)));
logUsedClasses(net);
@ -1219,9 +1219,9 @@ public class DTypeTests extends BaseDL4JTest {
.addLayer("2", new LocallyConnected1D.Builder().kernelSize(2).nOut(4).build(), "1")
.addLayer("out", new RnnOutputLayer.Builder().nOut(10).build(), "2")
.setOutputs("out")
.setInputTypes(InputType.recurrent(5, 4));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 5, 4)};
label = TestUtils.randomOneHotTimeSeries(2, 10, 4);
.setInputTypes(InputType.recurrent(5, 2));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 5, 2)};
label = TestUtils.randomOneHotTimeSeries(2, 10, 2);
break;
case 1:
b.addInputs("in")
@ -1229,8 +1229,8 @@ public class DTypeTests extends BaseDL4JTest {
.addLayer("2", new LocallyConnected2D.Builder().kernelSize(2, 2).nOut(5).build(), "1")
.addLayer("out", new OutputLayer.Builder().nOut(10).build(), "2")
.setOutputs("out")
.setInputTypes(InputType.convolutional(28, 28, 1));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 1, 28, 28)};
.setInputTypes(InputType.convolutional(8, 8, 1));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 1, 8, 8)};
label = TestUtils.randomOneHot(2, 10).castTo(networkDtype);
break;
default:

View File

@ -31,6 +31,7 @@ import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.conf.layers.LossLayer;
import org.deeplearning4j.nn.conf.layers.PrimaryCapsules;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Ignore;
import org.junit.Test;
import org.nd4j.evaluation.classification.Evaluation;
import org.nd4j.linalg.activations.impl.ActivationSoftmax;
@ -38,6 +39,7 @@ import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.learning.config.Adam;
import org.nd4j.linalg.lossfunctions.impl.LossNegativeLogLikelihood;
@Ignore("AB - ignored due to excessive runtime. Keep for manual debugging when required")
public class CapsNetMNISTTest extends BaseDL4JTest {
@Override

View File

@ -95,7 +95,7 @@ public class LocallyConnectedLayerTest extends BaseDL4JTest {
.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).l2(2e-4)
.updater(new Nesterovs(0.9)).dropOut(0.5)
.list()
.layer(new LocallyConnected1D.Builder().kernelSize(8).nIn(3)
.layer(new LocallyConnected1D.Builder().kernelSize(4).nIn(3)
.stride(1).nOut(16).dropOut(0.5)
.convolutionMode(ConvolutionMode.Strict)
.setInputSize(28)
@ -104,19 +104,19 @@ public class LocallyConnectedLayerTest extends BaseDL4JTest {
.build())
.layer(new OutputLayer.Builder(LossFunctions.LossFunction.SQUARED_LOSS) //output layer
.nOut(10).weightInit(WeightInit.XAVIER).activation(Activation.SOFTMAX).build())
.setInputType(InputType.recurrent(3, 28));
.setInputType(InputType.recurrent(3, 8));
MultiLayerConfiguration conf = builder.build();
MultiLayerNetwork network = new MultiLayerNetwork(conf);
network.init();
INDArray input = Nd4j.ones(10, 3, 28);
INDArray input = Nd4j.ones(10, 3, 8);
INDArray output = network.output(input, false);;
for (int i = 0; i < 100; i++) { // TODO: this falls flat for 1000 iterations on my machine
output = network.output(input, false);
}
assertArrayEquals(new long[] {(28 - 8 + 1) * 10, 10}, output.shape());
assertArrayEquals(new long[] {(8 - 4 + 1) * 10, 10}, output.shape());
network.fit(input, output);
}
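The updated assertion above follows from the locally connected arithmetic: with Strict convolution mode, a sequence of length 8 and a kernel of 4 at stride 1 give (8 - 4)/1 + 1 = 5 output steps per example, and the output appears to be flattened across the 10-example minibatch before the output layer. A small sketch of the expected shape, using values taken from the configuration in this hunk:

    // The flattening behaviour is inferred from the assertion itself, not verified against
    // the preprocessor implementation.
    int sequenceLength = 8, kernel = 4, stride = 1, minibatch = 10, nOut = 10;
    int outSteps = (sequenceLength - kernel) / stride + 1;                   // Strict mode: 5
    long[] expectedShape = new long[]{(long) outSteps * minibatch, nOut};    // {50, 10}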
@ -159,8 +159,10 @@ public class LocallyConnectedLayerTest extends BaseDL4JTest {
.addLayer("2", new LocallyConnected2D.Builder().kernelSize(2,2).nOut(5).build(), "1")
.addLayer("out", new OutputLayer.Builder().nOut(10).build(), "2")
.setOutputs("out")
.setInputTypes(InputType.convolutional(28, 28, 1));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 1, 28, 28)};
// .setInputTypes(InputType.convolutional(28, 28, 1));
// in = new INDArray[]{Nd4j.rand(networkDtype, 2, 1, 28, 28)};
.setInputTypes(InputType.convolutional(8, 8, 1));
in = new INDArray[]{Nd4j.rand(networkDtype, 2, 1, 8, 8)};
label = TestUtils.randomOneHot(2, 10).castTo(networkDtype);
break;
default:

View File

@ -93,8 +93,6 @@ public class TestSameDiffConv extends BaseDL4JTest {
//Note: to avoid the exponential number of tests here, we'll randomly run every Nth test only.
//With n=1, m=3 this is 1 out of every 3 tests (on average)
Random r = new Random(12345);
int n = 1;
int m = 30; //1 ot of every 30... 3888 possible combinations here
for (int minibatch : new int[]{5, 1}) {
Activation[] afns = new Activation[]{
@ -117,11 +115,8 @@ public class TestSameDiffConv extends BaseDL4JTest {
for (int[] dilation : new int[][]{{1, 1}, {2, 2}, {1, 2}}) {
for (ConvolutionMode cm : new ConvolutionMode[]{ConvolutionMode.Truncate, ConvolutionMode.Same}) {
for (Activation a : afns) {
int i = r.nextInt(m);
if (i >= n) {
//Example: n=2, m=3... skip on i=2, run test on i=0, i=1
continue;
}
if(r.nextInt(80) != 0)
continue; //1 of 80 on average - of 3888 possible combinations here -> ~49 tests
String msg = "Test " + (count++) + " - minibatch=" + minibatch + ", nIn=" + nIn
+ ", nOut=" + nOut + ", kernel=" + Arrays.toString(kernel) + ", stride="
@ -306,7 +301,7 @@ public class TestSameDiffConv extends BaseDL4JTest {
log.info("Starting: " + msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, f, l, null, null, true, 50); //Most of weights are in output layer
assertTrue(msg, gradOK);
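The skip logic above replaces the old n/m counters with a single seeded draw, r.nextInt(80) != 0, so each of the ~3888 kernel/stride/dilation/mode/activation combinations runs with probability 1/80 (about 49 cases per run, and the same cases every run because the Random is seeded). A minimal standalone sketch of that counting argument, using only java.util.Random:

import java.util.Random;

public class RandomSubsetCountSketch {
    public static void main(String[] args) {
        Random r = new Random(12345);          // same fixed seed as the test
        int total = 3888;                      // possible parameter combinations
        int kept = 0;
        for (int i = 0; i < total; i++) {
            if (r.nextInt(80) != 0)
                continue;                      // skip ~79 out of every 80 combinations
            kept++;                            // this combination would run a gradient check
        }
        System.out.println(kept + " of " + total + " combinations kept (expected about " + (total / 80.0) + ")");
    }
}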

View File

@ -100,7 +100,7 @@ public class TestOptimizers extends BaseDL4JTest {
ds.normalizeZeroMeanZeroUnitVariance();
for (OptimizationAlgorithm oa : toTest) {
int nIter = 10;
int nIter = 5;
MultiLayerNetwork network = new MultiLayerNetwork(getMLPConfigIris(oa));
network.init();
double score = network.score(ds);
@ -109,7 +109,7 @@ public class TestOptimizers extends BaseDL4JTest {
if (PRINT_OPT_RESULTS)
System.out.println("testOptimizersMLP() - " + oa);
int nCallsToOptimizer = 30;
int nCallsToOptimizer = 10;
double[] scores = new double[nCallsToOptimizer + 1];
scores[0] = score;
for (int i = 0; i < nCallsToOptimizer; i++) {
@ -256,34 +256,6 @@ public class TestOptimizers extends BaseDL4JTest {
}
}
@Test
public void testSphereFnOptStochGradDescentMultipleSteps() {
//Earlier tests: only do a single line search, though each line search will do multiple iterations
// of the line search algorithm.
//Here, do multiple optimization runs + multiple line search iterations within each run
//i.e., gradient is re-calculated at each step/run
//Single step tests earlier won't test storing of state between iterations
testSphereFnMultipleStepsHelper(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT, 100, 5);
}
@Test
public void testSphereFnOptLineGradDescentMultipleSteps() {
testSphereFnMultipleStepsHelper(OptimizationAlgorithm.LINE_GRADIENT_DESCENT, 100, 5);
}
@Test
public void testSphereFnOptCGMultipleSteps() {
testSphereFnMultipleStepsHelper(OptimizationAlgorithm.CONJUGATE_GRADIENT, 100, 5);
}
@Test
public void testSphereFnOptLBFGSMultipleSteps() {
testSphereFnMultipleStepsHelper(OptimizationAlgorithm.LBFGS, 100, 5);
}
private static void testSphereFnMultipleStepsHelper(OptimizationAlgorithm oa, int nOptIter,
int maxNumLineSearchIter) {
double[] scores = new double[nOptIter + 1];

View File

@ -58,8 +58,8 @@ public class ValidateCuDNN extends BaseDL4JTest {
int numClasses = 10;
//imageHeight,imageWidth,channels
int imageHeight = 240;
int imageWidth = 240;
int imageHeight = 64;
int imageWidth = 64;
int channels = 3;
IActivation activation = new ActivationIdentity();
MultiLayerConfiguration multiLayerConfiguration = new NeuralNetConfiguration.Builder()
@ -68,9 +68,9 @@ public class ValidateCuDNN extends BaseDL4JTest {
.activation(new ActivationELU())
.updater(new Nesterovs(1e-3, 0.9))
.list(
new Convolution2D.Builder().nOut(96)
.kernelSize(11, 11).biasInit(0.0)
.stride(4, 4).build(),
new Convolution2D.Builder().nOut(16)
.kernelSize(4, 4).biasInit(0.0)
.stride(2, 2).build(),
new ActivationLayer.Builder().activation(activation).build(),
new Pooling2D.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX)
@ -85,12 +85,12 @@ public class ValidateCuDNN extends BaseDL4JTest {
.poolingType(SubsamplingLayer.PoolingType.MAX)
.kernelSize(3, 3).stride(2, 2)
.build(),
new Convolution2D.Builder().nOut(384)
new Convolution2D.Builder().nOut(16)
.kernelSize(3, 3).padding(1, 1)
.biasInit(0.0)
.stride(1, 1).build(),
new ActivationLayer.Builder().activation(activation).build(),
new Convolution2D.Builder().nOut(256)
new Convolution2D.Builder().nOut(16)
.kernelSize(3, 3).padding(1, 1)
.stride(1, 1).build(),
new ActivationLayer.Builder().activation(activation).build(),
@ -99,7 +99,7 @@ public class ValidateCuDNN extends BaseDL4JTest {
.kernelSize(3, 3).stride(2, 2)
.build(),
new DenseLayer.Builder()
.nOut(4096)
.nOut(64)
.biasInit(0.0)
.build(),
new ActivationLayer.Builder().activation(activation).build(),
@ -114,8 +114,8 @@ public class ValidateCuDNN extends BaseDL4JTest {
MultiLayerNetwork net = new MultiLayerNetwork(multiLayerConfiguration);
net.init();
int[] fShape = new int[]{32, channels, imageHeight, imageWidth};
int[] lShape = new int[]{32, numClasses};
int[] fShape = new int[]{8, channels, imageHeight, imageWidth};
int[] lShape = new int[]{8, numClasses};
List<Class<?>> classesToTest = new ArrayList<>();
classesToTest.add(ConvolutionLayer.class);

View File

@ -144,12 +144,6 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
Activation[] activFns = {Activation.SIGMOID, Activation.TANH};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
DataSet ds = new IrisDataSetIterator(150, 150).next();
ds.normalizeZeroMeanZeroUnitVariance();
@ -161,73 +155,74 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
double[] l1vals = {0.0, 0.0, 0.5, 0.0};
double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
double[] biasL1 = {0.0, 0.0, 0.6, 0.0};
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.ELU, Activation.SOFTPLUS};
boolean[] characteristic = {false, true, false, true}; //If true: run some backprop steps first
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int k = 0; k < l2vals.length; k++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[k];
double l1 = l1vals[k];
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE, LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH, Activation.SOFTMAX, Activation.IDENTITY}; //i.e., lossFunctions[i] used with outputActivations[i] here
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.l2(l2).l1(l1).l2Bias(biasL2[k]).l1Bias(biasL1[k])
.optimizationAlgo(
OptimizationAlgorithm.CONJUGATE_GRADIENT)
.seed(12345L).list()
.layer(0, new ConvolutionLayer.Builder(new int[]{1, 1}).nIn(1).nOut(6)
.cudnnAllowFallback(false)
.weightInit(WeightInit.XAVIER).activation(afn)
.updater(new NoOp()).build())
.layer(1, new OutputLayer.Builder(lf).activation(outputActivation).nOut(3)
.weightInit(WeightInit.XAVIER).updater(new NoOp()).build())
for( int i=0; i<l2vals.length; i++ ){
Activation afn = activFns[i];
boolean doLearningFirst = characteristic[i];
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
double l2 = l2vals[i];
double l1 = l1vals[i];
.setInputType(InputType.convolutionalFlat(1, 4, 1));
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.l2(l2).l1(l1).l2Bias(biasL2[i]).l1Bias(biasL1[i])
.optimizationAlgo(
OptimizationAlgorithm.CONJUGATE_GRADIENT)
.seed(12345L).list()
.layer(0, new ConvolutionLayer.Builder(new int[]{1, 1}).nIn(1).nOut(6)
.weightInit(WeightInit.XAVIER).activation(afn)
.updater(new NoOp()).build())
.layer(1, new OutputLayer.Builder(lf).activation(outputActivation).nOut(3)
.weightInit(WeightInit.XAVIER).updater(new NoOp()).build())
MultiLayerConfiguration conf = builder.build();
.setInputType(InputType.convolutionalFlat(1, 4, 1));
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String testName = new Object() {
}.getClass().getEnclosingMethod().getName();
MultiLayerConfiguration conf = builder.build();
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int j = 0; j < 10; j++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = testName
+ "- score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
}
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String testName = new Object() {
}.getClass().getEnclosingMethod().getName();
if (PRINT_RESULTS) {
System.out.println(testName + "- activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}
}
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int j = 0; j < 10; j++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = testName
+ "- score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (before=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(testName + "- activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst);
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
TestUtils.testModelSerialization(mln);
}
}
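The rewrite above is the pattern used throughout this commit: nested loops over every activation/loss/regularization combination are collapsed into parallel arrays, with index i selecting one hand-picked combination per iteration, so the case count drops from the full cross product to the array length. An illustrative standalone sketch of the pattern (the values here are examples, not the exact test cases):

public class ParallelCaseSketch {
    public static void main(String[] args) {
        // One "column" per test case; index i picks one value from each array.
        double[] l2vals      = {0.0, 0.0, 0.5, 0.0};
        double[] l1vals      = {0.0, 0.0, 0.5, 0.0};
        String[] activFns    = {"sigmoid", "tanh", "elu", "softplus"};
        boolean[] learnFirst = {false, true, false, true};
        for (int i = 0; i < l2vals.length; i++) {
            System.out.printf("case %d: act=%s, learnFirst=%b, l2=%.1f, l1=%.1f%n",
                    i, activFns[i], learnFirst[i], l2vals[i], l1vals[i]);
        }
    }
}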
@ -375,57 +370,43 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int[] padding = {0, 0};
int size = 2;
String[] activations = {"sigmoid", "tanh"};
SubsamplingLayer.PoolingType[] poolingTypes =
new SubsamplingLayer.PoolingType[]{SubsamplingLayer.PoolingType.MAX,
SubsamplingLayer.PoolingType.AVG, SubsamplingLayer.PoolingType.PNORM};
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
for (String afn : activations) {
for (SubsamplingLayer.PoolingType poolingType : poolingTypes) {
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1))
.list().layer(new ConvolutionLayer.Builder(kernel,
stride, padding).nIn(inputDepth)
.nOut(3).build())//output: (5-2+0)/1+1 = 4
.layer(new Upsampling2D.Builder().size(size).build()) //output: 4*2 =8 -> 8x8x3
.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(8 * 8 * 3)
.nOut(4).build())
.setInputType(InputType.convolutionalFlat(height, width,
inputDepth))
.build();
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder()
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.dist(new NormalDistribution(0, 1))
.list().layer(new ConvolutionLayer.Builder(kernel,
stride, padding).nIn(inputDepth)
.cudnnAllowFallback(false)
.nOut(3).build())//output: (5-2+0)/1+1 = 4
.layer(new Upsampling2D.Builder().size(size).build()) //output: 4*2 =8 -> 8x8x3
.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(8 * 8 * 3)
.nOut(4).build())
.setInputType(InputType.convolutionalFlat(height, width,
inputDepth))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
String msg = "Upsampling - minibatch=" + minibatchSize;
String msg = "PoolingType=" + poolingType + ", minibatch=" + minibatchSize + ", activationFn="
+ afn;
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
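TestUtils.randomOneHot(minibatchSize, nOut), used above in place of the removed putScalar loops, is assumed to build a [minibatchSize, nOut] matrix with a single 1.0 per row at a randomly chosen column. A minimal sketch of that behaviour (the helper name and seed here are illustrative, not the library implementation):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import java.util.Random;

public class RandomOneHotSketch {
    // Hypothetical re-implementation of the one-hot label helper, for illustration only
    public static INDArray randomOneHot(int rows, int nOut, long seed) {
        Random r = new Random(seed);
        INDArray out = Nd4j.zeros(rows, nOut);
        for (int i = 0; i < rows; i++) {
            out.putScalar(i, r.nextInt(nOut), 1.0);   // exactly one active class per row
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(randomOneHot(3, 4, 12345L));
    }
}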
@ -646,63 +627,56 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testCnnSamePaddingMode() {
int nOut = 2;
int[] minibatchSizes = {1, 3};
int[] minibatchSizes = {1, 3, 3, 2, 1, 2};
int[] heights = new int[]{4, 5, 6, 5, 4, 4}; //Same padding mode: insensitive to exact input size...
int[] kernelSizes = new int[]{2, 3, 2, 3, 2, 3};
int[] inputDepths = {1, 2, 4, 3, 2, 3};
int width = 5;
int[] heights = new int[]{4, 5, 6}; //Same padding mode: insensitive to exact input size...
int[] kernelSizes = new int[]{2, 3};
int[] inputDepths = {1, 2, 4};
Nd4j.getRandom().setSeed(12345);
for (int inputDepth : inputDepths) {
for (int minibatchSize : minibatchSizes) {
for (int height : heights) {
for (int k : kernelSizes) {
for( int i=0; i<minibatchSizes.length; i++ ){
int inputDepth = inputDepths[i];
int minibatchSize = minibatchSizes[i];
int height = heights[i];
int k = kernelSizes[i];
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
INDArray input = Nd4j.rand(minibatchSize, width * height * inputDepth);
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(Same).list()
.layer(0, new ConvolutionLayer.Builder().name("layer 0").kernelSize(k, k)
.cudnnAllowFallback(false)
.stride(1, 1).padding(0, 0).nIn(inputDepth).nOut(2).build())
.layer(1, new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX).kernelSize(k, k)
.cudnnAllowFallback(false)
.stride(1, 1).padding(0, 0).build())
.layer(2, new ConvolutionLayer.Builder().nIn(2).nOut(2).kernelSize(k, k)
.cudnnAllowFallback(false)
.stride(1, 1).padding(0, 0).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(height, width, inputDepth)).build();
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(Same).list()
.layer(0, new ConvolutionLayer.Builder().name("layer 0").kernelSize(k, k)
.stride(1, 1).padding(0, 0).nIn(inputDepth).nOut(2).build())
.layer(1, new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(2, new ConvolutionLayer.Builder().nIn(2).nOut(2).kernelSize(k, k)
.stride(1, 1).padding(0, 0).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(height, width, inputDepth)).build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
for (int i = 0; i < net.getLayers().length; i++) {
System.out.println("nParams, layer " + i + ": " + net.getLayer(i).numParams());
}
String msg = "Minibatch=" + minibatchSize + ", inDepth=" + inputDepth + ", height=" + height
+ ", width=" + width + ", kernelSize=" + k;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
for (int j = 0; j < net.getLayers().length; j++) {
System.out.println("nParams, layer " + j + ": " + net.getLayer(j).numParams());
}
String msg = "Minibatch=" + minibatchSize + ", inDepth=" + inputDepth + ", height=" + height
+ ", kernelSize=" + k;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
@ -732,12 +706,10 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
}
Layer convLayer = new ConvolutionLayer.Builder().name("layer 0").kernelSize(k, k)
.cudnnAllowFallback(false)
.stride(stride, stride).padding(0, 0).nIn(inputDepth).nOut(2).build();
Layer poolLayer = new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX).kernelSize(k, k)
.cudnnAllowFallback(false)
.stride(stride, stride).padding(0, 0).build();
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
@ -765,7 +737,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input,
labels);
labels, null, null, true, 128);
assertTrue(msg, gradOK);
@ -783,69 +755,66 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
Nd4j.getRandom().setSeed(12345);
int nOut = 4;
int[] minibatchSizes = {1, 3};
int width = 6;
int height = 6;
int[] inputDepths = {1, 3};
int[] kernel = {2, 2};
int[] stride = {1, 1};
int[] padding = {0, 0};
int[] minibatchSizes = {1, 3, 2};
int[] inputDepths = {1, 3, 2};
int[][] zeroPadLayer = new int[][]{{0, 0, 0, 0}, {1, 1, 0, 0}, {2, 2, 2, 2}};
for (int inputDepth : inputDepths) {
for (int minibatchSize : minibatchSizes) {
INDArray input = Nd4j.rand(new int[]{minibatchSize, inputDepth, height, width});
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
for (int[] zeroPad : zeroPadLayer) {
for( int i=0; i<minibatchSizes.length; i++ ){
int minibatchSize = minibatchSizes[i];
int inputDepth = inputDepths[i];
int[] zeroPad = zeroPadLayer[i];
INDArray input = Nd4j.rand(DataType.DOUBLE, new int[]{minibatchSize, inputDepth, height, width});
INDArray labels = TestUtils.randomOneHot(minibatchSize, nOut);
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new ConvolutionLayer.Builder(kernel, stride, padding)
.cudnnAllowFallback(false)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(1, new ZeroPaddingLayer.Builder(zeroPad).build()).layer(2,
new ConvolutionLayer.Builder(kernel, stride,
padding).nIn(3).nOut(3).cudnnAllowFallback(false).build())//output: (6-2+0)/1+1 = 5
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
MultiLayerConfiguration conf =
new NeuralNetConfiguration.Builder().updater(new NoOp())
.dataType(DataType.DOUBLE)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new ConvolutionLayer.Builder(kernel, stride, padding)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(1, new ZeroPaddingLayer.Builder(zeroPad).build()).layer(2,
new ConvolutionLayer.Builder(kernel, stride,
padding).nIn(3).nOut(3).build())//output: (6-2+0)/1+1 = 5
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
//Check zero padding activation shape
org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer zpl =
(org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer) net.getLayer(1);
val expShape = new long[]{minibatchSize, inputDepth, height + zeroPad[0] + zeroPad[1],
width + zeroPad[2] + zeroPad[3]};
INDArray out = zpl.activate(input, false, LayerWorkspaceMgr.noWorkspaces());
assertArrayEquals(expShape, out.shape());
//Check zero padding activation shape
org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer zpl =
(org.deeplearning4j.nn.layers.convolution.ZeroPaddingLayer) net.getLayer(1);
val expShape = new long[]{minibatchSize, inputDepth, height + zeroPad[0] + zeroPad[1],
width + zeroPad[2] + zeroPad[3]};
INDArray out = zpl.activate(input, false, LayerWorkspaceMgr.noWorkspaces());
assertArrayEquals(expShape, out.shape());
String msg = "minibatch=" + minibatchSize + ", channels=" + inputDepth + ", zeroPad = "
+ Arrays.toString(zeroPad);
String msg = "minibatch=" + minibatchSize + ", channels=" + inputDepth + ", zeroPad = "
+ Arrays.toString(zeroPad);
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
if (PRINT_RESULTS) {
System.out.println(msg);
for (int j = 0; j < net.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
TestUtils.testModelSerialization(net);
}
}
@ -853,12 +822,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testDeconvolution2D() {
int nOut = 2;
int[] minibatchSizes = new int[]{1, 4, 1, 4, 1, 1, 2, 1};
int[] kernelSizes = new int[]{1, 1, 3, 3, 1, 1, 3, 3};
int[] strides = {1, 1, 1, 1, 2, 2, 2, 2};
int[] dilation = {1, 2, 2, 1, 1, 1, 2, 2};
Activation[] activations = new Activation[]{Activation.SIGMOID, Activation.TANH, Activation.TANH, Activation.TANH, Activation.TANH, Activation.SIGMOID, Activation.SIGMOID, Activation.SIGMOID};
ConvolutionMode[] cModes = new ConvolutionMode[]{Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate};
int[] minibatchSizes = new int[]{1, 3, 3, 1, 3};
int[] kernelSizes = new int[]{1, 1, 1, 3, 3};
int[] strides = {1, 1, 2, 2, 2};
int[] dilation = {1, 2, 1, 2, 2};
Activation[] activations = new Activation[]{Activation.SIGMOID, Activation.TANH, Activation.SIGMOID, Activation.SIGMOID, Activation.SIGMOID};
ConvolutionMode[] cModes = new ConvolutionMode[]{Same, Same, Truncate, Truncate, Truncate};
int width = 7;
int height = 7;
int inputDepth = 3;
@ -888,23 +857,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.updater(new NoOp())
.activation(act)
.list()
.layer(new Deconvolution2D.Builder().name("deconvolution_2D_layer-0")
.cudnnAllowFallback(false)
.kernelSize(1, 1)
.stride(1, 1)
.dilation(0, 0)
.convolutionMode(cm)
.nIn(inputDepth)
.nOut(inputDepth)
.build())
.layer(new Deconvolution2D.Builder().name("deconvolution_2D_layer")
.cudnnAllowFallback(false)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.convolutionMode(cm)
.nIn(inputDepth).nOut(nOut)
.build());
.nIn(inputDepth).nOut(nOut).build());
MultiLayerConfiguration conf = b.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
@ -922,7 +880,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 100);
assertTrue(msg, gradOK);
@ -936,16 +894,16 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int depthMultiplier = 2;
int nOut = nIn * depthMultiplier;
int width = 8;
int height = 8;
int width = 5;
int height = 5;
Nd4j.getRandom().setSeed(12345);
int[] ks = new int[]{1,3,1,3,1,3,1,3};
int[] ss = new int[]{1,1,2,2,1,1,2,2};
int[] ks = new int[]{1,3,3,1,3};
int[] ss = new int[]{1,1,1,2,2};
ConvolutionMode[] cms = new ConvolutionMode[]{
Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1,1,3,3,3,1,3,3};
Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1,1,1,3,3};
for( int t=0; t<ks.length; t++ ){
@ -987,11 +945,11 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
}
String msg = " - mb=" + minibatchSize + ", k="
+ k + ", s=" + s + ", cm=" + cm;
+ k + ", nIn=" + nIn + ", depthMul=" + depthMultiplier + ", s=" + s + ", cm=" + cm;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 256);
assertTrue(msg, gradOK);
@ -1004,20 +962,20 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testSeparableConv2D() {
int nOut = 2;
int width = 8;
int height = 8;
int[] minibatchSizes = new int[]{1, 3};
int width = 6;
int height = 6;
int inputDepth = 3;
Nd4j.getRandom().setSeed(12345);
int[] ks = new int[]{1,3,1,3,1,3,1,3};
int[] ss = new int[]{1,1,2,2,1,1,2,2};
int[] ds = new int[]{1,1,1,1,2,2,2,2};
ConvolutionMode[] cms = new ConvolutionMode[]{
Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1,1,3,3,3,1,3,3};
int[] ks = new int[]{1, 3, 3, 1, 3};
int[] ss = new int[]{1, 1, 1, 2, 2};
int[] ds = new int[]{1, 1, 2, 2, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Truncate, Truncate, Truncate, Truncate, Truncate};
int[] mb = new int[]{1, 1, 1, 3, 3};
for( int t=0; t<ks.length; t++ ){
for (int t = 0; t < ks.length; t++) {
int k = ks[t];
int s = ss[t];
@ -1041,10 +999,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.activation(Activation.TANH)
.convolutionMode(cm)
.list()
.layer(new Convolution2D.Builder().kernelSize(1, 1).stride(1, 1)
.nIn(inputDepth).nOut(inputDepth).build())
.layer(new SeparableConvolution2D.Builder().name("Separable conv 2D layer")
.cudnnAllowFallback(false)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
@ -1067,7 +1022,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 50); //Most params are in output layer
assertTrue(msg, gradOK);
@ -1079,21 +1034,21 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
public void testCnnDilated() {
int nOut = 2;
int minibatchSize = 3;
int minibatchSize = 2;
int width = 8;
int height = 8;
int inputDepth = 3;
int inputDepth = 2;
Nd4j.getRandom().setSeed(12345);
boolean[] sub = new boolean[]{true,false,true,false,true,false,true,false};
int[] stride = new int[]{1,1,2,2,1,1,2,2};
int[] kernel = new int[]{2,2,2,2,3,3,3,3};
int[] ds = new int[]{2,3,3,2,2,3,3,2};
ConvolutionMode[] cms = new ConvolutionMode[]{Same, Same, Same, Truncate, Truncate, Truncate, Same, Truncate};
boolean[] sub = new boolean[]{true, true, false, true, false};
int[] stride = new int[]{1, 1, 1, 2, 2};
int[] kernel = new int[]{2, 3, 3, 3, 3};
int[] ds = new int[]{2, 2, 3, 3, 2};
ConvolutionMode[] cms = new ConvolutionMode[]{Same, Truncate, Truncate, Same, Truncate};
for(int t=0; t<sub.length; t++ ){
for (int t = 0; t < sub.length; t++) {
boolean subsampling = sub[t];
int s = stride[t];
@ -1119,14 +1074,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.cudnnAllowFallback(false)
.nIn(inputDepth).nOut(2).build());
if (subsampling) {
b.layer(new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX)
.kernelSize(k, k)
.stride(s, s)
.cudnnAllowFallback(false)
.dilation(d, d)
.build());
} else {
@ -1134,7 +1087,6 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.cudnnAllowFallback(false)
.build());
}
@ -1166,7 +1118,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
@Test
public void testCropping2DLayer() {
Nd4j.getRandom().setSeed(12345);
int nOut = 4;
int nOut = 2;
int[] minibatchSizes = {1, 3};
int width = 12;
@ -1177,7 +1129,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
int[] stride = {1, 1};
int[] padding = {0, 0};
int[][] cropTestCases = new int[][]{{0, 0, 0, 0}, {1, 1, 0, 0}, {2, 2, 2, 2}, {1,2,3,4}};
int[][] cropTestCases = new int[][]{{0, 0, 0, 0}, {1, 1, 0, 0}, {2, 2, 2, 2}, {1, 2, 3, 4}};
for (int inputDepth : inputDepths) {
for (int minibatchSize : minibatchSizes) {
@ -1195,12 +1147,12 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
.convolutionMode(ConvolutionMode.Same)
.weightInit(new NormalDistribution(0, 1)).list()
.layer(new ConvolutionLayer.Builder(kernel, stride, padding)
.cudnnAllowFallback(false)
.nIn(inputDepth).nOut(3).build())//output: (6-2+0)/1+1 = 5
.nIn(inputDepth).nOut(2).build())//output: (6-2+0)/1+1 = 5
.layer(new Cropping2D(crop))
.layer(new ConvolutionLayer.Builder(kernel, stride,padding).nIn(3).nOut(3).cudnnAllowFallback(false).build())
.layer(new ConvolutionLayer.Builder(kernel, stride, padding).nIn(2).nOut(2).build())
.layer(new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.AVG).kernelSize(3, 3).stride(3, 3).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(4).build())
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutional(height, width, inputDepth))
.build();
@ -1225,7 +1177,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 160);
assertTrue(msg, gradOK);
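Many of the gradient checks above now call the extended checkGradients overload whose trailing arguments (null input/label masks, then true and a count such as 50, 128 or 160) appear to restrict the check to a random subset of parameters; that subset is where most of the runtime saving comes from. A minimal standalone sketch of that call shape, assuming the same overload as used in these tests; the exact meaning of the last two arguments should be confirmed against GradientCheckUtil in this DL4J version:

import org.deeplearning4j.gradientcheck.GradientCheckUtil;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.learning.config.NoOp;
import org.nd4j.linalg.lossfunctions.LossFunctions;

public class SubsetGradientCheckSketch {
    public static void main(String[] args) {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .dataType(DataType.DOUBLE)      // gradient checks need double precision
                .updater(new NoOp())            // no updater, so numeric and analytic gradients match
                .seed(12345)
                .list()
                .layer(new DenseLayer.Builder().nIn(4).nOut(4).activation(Activation.TANH).build())
                .layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX).nIn(4).nOut(3).build())
                .build();
        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();

        INDArray f = Nd4j.rand(DataType.DOUBLE, 3, 4);
        INDArray l = Nd4j.zeros(3, 3);
        for (int i = 0; i < 3; i++)
            l.putScalar(i, i % 3, 1.0);         // one-hot labels, as in the tests above

        // Trailing (true, 32) assumed to check only a random subset of up to 32 values per parameter array
        boolean ok = GradientCheckUtil.checkGradients(net, 1e-6, 1e-3, 1e-8,
                false, false, f, l, null, null, true, 32);
        System.out.println("Gradients OK: " + ok);
    }
}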

View File

@ -369,10 +369,10 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
public void testLSTM() throws Exception {
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int inputSize = 8;
int lstmLayerSize = 7;
int timeSeriesLength = 6;
int minibatch = 4;
int inputSize = 3;
int lstmLayerSize = 4;
int timeSeriesLength = 3;
int nOut = 4;
INDArray input = Nd4j.rand(new int[] {minibatch, inputSize, timeSeriesLength});
INDArray labels = Nd4j.zeros(minibatch, nOut, timeSeriesLength);
@ -417,7 +417,7 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels, null, null, true, 32);
assertTrue(gradOK);
}
@ -489,10 +489,7 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
int width = 8;
int height = 8;
int inputDepth = 3;
int[] kernelSizes = new int[]{2, 3};
int[] strides = {1, 2};
int[] dilation = {2, 3};
ConvolutionMode[] cModes = new ConvolutionMode[]{ConvolutionMode.Truncate, ConvolutionMode.Same};
Nd4j.getRandom().setSeed(12345);
@ -502,85 +499,88 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
Field f2 = org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer.class.getDeclaredField("helper");
f2.setAccessible(true);
int[] kernelSizes = new int[]{2, 3, 2};
int[] strides = {1, 2, 2};
int[] dilation = {2, 3, 2};
ConvolutionMode[] cModes = new ConvolutionMode[]{ConvolutionMode.Truncate, ConvolutionMode.Same, ConvolutionMode.Truncate};
for (boolean subsampling : new boolean[]{false, true}) {
for (int k : kernelSizes) {
for (int s : strides) {
for (int d : dilation) {
for (ConvolutionMode cm : cModes) {
for (int t = 0; t < kernelSizes.length; t++) {
int k = kernelSizes[t];
int s = strides[t];
int d = dilation[t];
ConvolutionMode cm = cModes[t];
//Use larger input with larger dilation values (to avoid invalid config)
int w = d * width;
int h = d * height;
//Use larger input with larger dilation values (to avoid invalid config)
int w = d * width;
int h = d * height;
INDArray input = Nd4j.rand(minibatchSize, w * h * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
NeuralNetConfiguration.ListBuilder b = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(cm).list()
.layer(new ConvolutionLayer.Builder().name("layer 0")
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.nIn(inputDepth).nOut(2).build());
if (subsampling) {
b.layer(new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.build());
} else {
b.layer(new ConvolutionLayer.Builder().nIn(2).nOut(2)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.build());
}
MultiLayerConfiguration conf = b.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(h, w, inputDepth)).build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c0 =
(org.deeplearning4j.nn.layers.convolution.ConvolutionLayer)net.getLayer(0);
ConvolutionHelper ch0 = (ConvolutionHelper) f.get(c0);
assertTrue(ch0 instanceof CudnnConvolutionHelper);
if(subsampling){
org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer s1 =
(org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer)net.getLayer(1);
SubsamplingHelper sh1 = (SubsamplingHelper) f2.get(s1);
assertTrue(sh1 instanceof SubsamplingHelper);
} else {
org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c1 =
(org.deeplearning4j.nn.layers.convolution.ConvolutionLayer)net.getLayer(1);
ConvolutionHelper ch1 = (ConvolutionHelper) f.get(c1);
assertTrue(ch1 instanceof CudnnConvolutionHelper);
}
for (int i = 0; i < net.getLayers().length; i++) {
System.out.println("nParams, layer " + i + ": " + net.getLayer(i).numParams());
}
String msg = (subsampling ? "subsampling" : "conv") + " - mb=" + minibatchSize + ", k="
+ k + ", s=" + s + ", d=" + d + ", cm=" + cm;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
}
}
INDArray input = Nd4j.rand(minibatchSize, w * h * inputDepth);
INDArray labels = Nd4j.zeros(minibatchSize, nOut);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[]{i, i % nOut}, 1.0);
}
NeuralNetConfiguration.ListBuilder b = new NeuralNetConfiguration.Builder().seed(12345)
.dataType(DataType.DOUBLE)
.updater(new NoOp())
.activation(Activation.TANH).convolutionMode(cm).list()
.layer(new ConvolutionLayer.Builder().name("layer 0")
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.nIn(inputDepth).nOut(2).build());
if (subsampling) {
b.layer(new SubsamplingLayer.Builder()
.poolingType(SubsamplingLayer.PoolingType.MAX)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.build());
} else {
b.layer(new ConvolutionLayer.Builder().nIn(2).nOut(2)
.kernelSize(k, k)
.stride(s, s)
.dilation(d, d)
.build());
}
MultiLayerConfiguration conf = b.layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutionalFlat(h, w, inputDepth)).build();
MultiLayerNetwork net = new MultiLayerNetwork(conf);
net.init();
org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c0 =
(org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) net.getLayer(0);
ConvolutionHelper ch0 = (ConvolutionHelper) f.get(c0);
assertTrue(ch0 instanceof CudnnConvolutionHelper);
if (subsampling) {
org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer s1 =
(org.deeplearning4j.nn.layers.convolution.subsampling.SubsamplingLayer) net.getLayer(1);
SubsamplingHelper sh1 = (SubsamplingHelper) f2.get(s1);
assertTrue(sh1 instanceof SubsamplingHelper);
} else {
org.deeplearning4j.nn.layers.convolution.ConvolutionLayer c1 =
(org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) net.getLayer(1);
ConvolutionHelper ch1 = (ConvolutionHelper) f.get(c1);
assertTrue(ch1 instanceof CudnnConvolutionHelper);
}
for (int i = 0; i < net.getLayers().length; i++) {
System.out.println("nParams, layer " + i + ": " + net.getLayer(i).numParams());
}
String msg = (subsampling ? "subsampling" : "conv") + " - mb=" + minibatchSize + ", k="
+ k + ", s=" + s + ", d=" + d + ", cm=" + cm;
System.out.println(msg);
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(msg, gradOK);
}
}
}
@ -588,7 +588,7 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
@Test
public void testDropout() {
int minibatch = 3;
int minibatch = 2;
for (boolean cnn : new boolean[]{false, true}) {
Nd4j.getRandom().setSeed(12345);
@ -605,15 +605,15 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
.list();
if (cnn) {
builder.layer(new ConvolutionLayer.Builder().kernelSize(3, 3).stride(1, 1).nOut(3).build());
builder.layer(new ConvolutionLayer.Builder().kernelSize(3, 3).stride(1, 1).nOut(3).build());
builder.setInputType(InputType.convolutional(8, 8, 3));
builder.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(2, 2).nOut(2).build());
builder.layer(new ConvolutionLayer.Builder().kernelSize(2, 2).stride(2, 2).nOut(2).build());
builder.setInputType(InputType.convolutional(8, 8, 2));
} else {
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.layer(new DenseLayer.Builder().nOut(12).build());
builder.setInputType(InputType.feedForward(8));
builder.layer(new DenseLayer.Builder().nOut(8).build());
builder.layer(new DenseLayer.Builder().nOut(8).build());
builder.setInputType(InputType.feedForward(6));
}
builder.layer(new OutputLayer.Builder().nOut(10).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build());
builder.layer(new OutputLayer.Builder().nOut(3).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build());
MultiLayerConfiguration conf = builder.build();
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
@ -621,11 +621,11 @@ public class CuDNNGradientChecks extends BaseDL4JTest {
INDArray f;
if (cnn) {
f = Nd4j.rand(new int[]{minibatch, 3, 8, 8}).muli(10).subi(5);
f = Nd4j.rand(new int[]{minibatch, 2, 8, 8}).muli(10).subi(5);
} else {
f = Nd4j.rand(minibatch, 8).muli(10).subi(5);
f = Nd4j.rand(minibatch, 6).muli(10).subi(5);
}
INDArray l = TestUtils.randomOneHot(minibatch, 10);
INDArray l = TestUtils.randomOneHot(minibatch, 3);
mln.output(f, true);

View File

@ -0,0 +1,140 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.graph;
import lombok.extern.slf4j.Slf4j;
import org.bytedeco.javacpp.Pointer;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.TestName;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.ops.executioner.OpExecutioner;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.profiler.ProfilerConfig;
import java.lang.management.ManagementFactory;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@Slf4j
public class BaseDL4JTest {
@Rule
public TestName name = new TestName();
protected long startTime;
protected int threadCountBefore;
/**
* Override this to set the profiling mode for the tests defined in the child class
*/
public OpExecutioner.ProfilingMode getProfilingMode(){
return OpExecutioner.ProfilingMode.SCOPE_PANIC;
}
/**
* Override this to set the datatype of the tests defined in the child class
*/
public DataType getDataType(){
return DataType.DOUBLE;
}
public DataType getDefaultFPDataType(){
return getDataType();
}
@Before
public void beforeTest(){
log.info("{}.{}", getClass().getSimpleName(), name.getMethodName());
Nd4j.getExecutioner().setProfilingMode(getProfilingMode());
Nd4j.getExecutioner().setProfilingConfig(ProfilerConfig.builder().build());
Nd4j.setDefaultDataTypes(getDataType(), getDefaultFPDataType());
startTime = System.currentTimeMillis();
threadCountBefore = ManagementFactory.getThreadMXBean().getThreadCount();
}
@After
public void afterTest(){
//Attempt to keep workspaces isolated between tests
Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
MemoryWorkspace currWS = Nd4j.getMemoryManager().getCurrentWorkspace();
Nd4j.getMemoryManager().setCurrentWorkspace(null);
if(currWS != null){
//Not really safe to continue testing under this situation... other tests will likely fail with obscure
// errors that are hard to track back to this
log.error("Open workspace leaked from test! Exiting - {}, isOpen = {} - {}", currWS.getId(), currWS.isScopeActive(), currWS);
System.exit(1);
}
StringBuilder sb = new StringBuilder();
long maxPhys = Pointer.maxPhysicalBytes();
long maxBytes = Pointer.maxBytes();
long currPhys = Pointer.physicalBytes();
long currBytes = Pointer.totalBytes();
long jvmTotal = Runtime.getRuntime().totalMemory();
long jvmMax = Runtime.getRuntime().maxMemory();
int threadsAfter = ManagementFactory.getThreadMXBean().getThreadCount();
long duration = System.currentTimeMillis() - startTime;
sb.append(getClass().getSimpleName()).append(".").append(name.getMethodName())
.append(": ").append(duration).append(" ms")
.append(", threadCount: (").append(threadCountBefore).append("->").append(threadsAfter).append(")")
.append(", jvmTotal=").append(jvmTotal)
.append(", jvmMax=").append(jvmMax)
.append(", totalBytes=").append(currBytes).append(", maxBytes=").append(maxBytes)
.append(", currPhys=").append(currPhys).append(", maxPhys=").append(maxPhys);
List<MemoryWorkspace> ws = Nd4j.getWorkspaceManager().getAllWorkspacesForCurrentThread();
if(ws != null && ws.size() > 0){
long currSize = 0;
for(MemoryWorkspace w : ws){
currSize += w.getCurrentSize();
}
if(currSize > 0){
sb.append(", threadWSSize=").append(currSize)
.append(" (").append(ws.size()).append(" WSs)");
}
}
Properties p = Nd4j.getExecutioner().getEnvironmentInformation();
Object o = p.get("cuda.devicesInformation");
if(o instanceof List){
List<Map<String,Object>> l = (List<Map<String, Object>>) o;
if(l.size() > 0) {
sb.append(" [").append(l.size())
.append(" GPUs: ");
for (int i = 0; i < l.size(); i++) {
Map<String,Object> m = l.get(i);
if(i > 0)
sb.append(",");
sb.append("(").append(m.get("cuda.freeMemory")).append(" free, ")
.append(m.get("cuda.totalMemory")).append(" total)");
}
sb.append("]");
}
}
log.info(sb.toString());
}
}
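The new base class above is what the remaining files in this commit switch their test classes to extend. A minimal sketch of a test that opts into it, with an optional data-type override (the class and method names here are illustrative):

package org.deeplearning4j.graph;

import org.junit.Test;
import org.nd4j.linalg.api.buffer.DataType;
import static org.junit.Assert.assertEquals;

public class ExampleGraphTest extends BaseDL4JTest {

    @Override
    public DataType getDataType() {
        return DataType.FLOAT;   // run this class in single precision instead of the default DOUBLE
    }

    @Test(timeout = 10000L)
    public void testSomething() {
        // beforeTest() has already set the data types and profiling mode;
        // afterTest() will check for leaked workspaces and log runtime/memory for this method.
        assertEquals(4, 2 + 2);  // placeholder body; real tests build graphs/networks here
    }
}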

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.graph.data;
import org.apache.commons.lang3.ArrayUtils;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.deeplearning4j.graph.api.Edge;
import org.deeplearning4j.graph.api.IGraph;
import org.deeplearning4j.graph.data.impl.DelimitedEdgeLineProcessor;
@ -32,7 +33,7 @@ import java.util.List;
import static org.junit.Assert.*;
public class TestGraphLoading {
public class TestGraphLoading extends BaseDL4JTest {
@Test(timeout = 10000L)
public void testEdgeListGraphLoading() throws IOException {

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.graph.data;
import org.apache.commons.lang3.ArrayUtils;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.deeplearning4j.graph.api.Edge;
import org.deeplearning4j.graph.api.IGraph;
import org.deeplearning4j.graph.data.impl.WeightedEdgeLineProcessor;
@ -32,7 +33,7 @@ import java.util.List;
import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.assertEquals;
public class TestGraphLoadingWeighted {
public class TestGraphLoadingWeighted extends BaseDL4JTest {
@Test(timeout = 10000L)
public void testWeightedDirected() throws IOException {

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.graph.graph;
import org.apache.commons.lang3.ArrayUtils;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.deeplearning4j.graph.api.*;
import org.deeplearning4j.graph.data.GraphLoader;
import org.deeplearning4j.graph.iterator.RandomWalkIterator;
@ -34,7 +35,7 @@ import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.*;
public class TestGraph {
public class TestGraph extends BaseDL4JTest {
@Test(timeout = 10000L)
public void testSimpleGraph() {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.graph.models.deepwalk;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.deeplearning4j.graph.data.GraphLoader;
import org.deeplearning4j.graph.graph.Graph;
import org.deeplearning4j.graph.iterator.GraphWalkIterator;
@ -35,7 +36,7 @@ import java.util.Arrays;
import static org.junit.Assert.*;
public class DeepWalkGradientCheck {
public class DeepWalkGradientCheck extends BaseDL4JTest {
public static final double epsilon = 1e-8;
public static final double MAX_REL_ERROR = 1e-3;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.graph.models.deepwalk;
import org.apache.commons.io.FilenameUtils;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.deeplearning4j.graph.api.Edge;
import org.deeplearning4j.graph.api.IGraph;
import org.deeplearning4j.graph.data.GraphLoader;
@ -42,7 +43,7 @@ import java.util.Random;
import static org.junit.Assert.*;
public class TestDeepWalk {
public class TestDeepWalk extends BaseDL4JTest {
@Rule
public TemporaryFolder testDir = new TemporaryFolder();
@ -214,7 +215,7 @@ public class TestDeepWalk {
Nd4j.getRandom().setSeed(12345);
int nEpochs = 50;
int nEpochs = 5;
//Set up network
DeepWalk<String, String> deepWalk =

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.graph.models.deepwalk;
import org.deeplearning4j.graph.BaseDL4JTest;
import org.junit.Test;
import java.util.Arrays;
@ -24,7 +25,7 @@ import java.util.Set;
import static org.junit.Assert.*;
public class TestGraphHuffman {
public class TestGraphHuffman extends BaseDL4JTest {
@Test(timeout = 10000L)
public void testGraphHuffman() {

View File

@ -0,0 +1,140 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.nn.modelimport.keras;
import lombok.extern.slf4j.Slf4j;
import org.bytedeco.javacpp.Pointer;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.TestName;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.ops.executioner.OpExecutioner;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.profiler.ProfilerConfig;
import java.lang.management.ManagementFactory;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@Slf4j
public class BaseDL4JTest {
@Rule
public TestName name = new TestName();
protected long startTime;
protected int threadCountBefore;
/**
* Override this to set the profiling mode for the tests defined in the child class
*/
public OpExecutioner.ProfilingMode getProfilingMode(){
return OpExecutioner.ProfilingMode.SCOPE_PANIC;
}
/**
* Override this to set the datatype of the tests defined in the child class
*/
public DataType getDataType(){
return DataType.DOUBLE;
}
public DataType getDefaultFPDataType(){
return getDataType();
}
@Before
public void beforeTest(){
log.info("{}.{}", getClass().getSimpleName(), name.getMethodName());
Nd4j.getExecutioner().setProfilingMode(getProfilingMode());
Nd4j.getExecutioner().setProfilingConfig(ProfilerConfig.builder().build());
Nd4j.setDefaultDataTypes(getDataType(), getDefaultFPDataType());
startTime = System.currentTimeMillis();
threadCountBefore = ManagementFactory.getThreadMXBean().getThreadCount();
}
@After
public void afterTest(){
//Attempt to keep workspaces isolated between tests
Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
MemoryWorkspace currWS = Nd4j.getMemoryManager().getCurrentWorkspace();
Nd4j.getMemoryManager().setCurrentWorkspace(null);
if(currWS != null){
//Not really safe to continue testing under this situation... other tests will likely fail with obscure
// errors that are hard to track back to this
log.error("Open workspace leaked from test! Exiting - {}, isOpen = {} - {}", currWS.getId(), currWS.isScopeActive(), currWS);
System.exit(1);
}
StringBuilder sb = new StringBuilder();
long maxPhys = Pointer.maxPhysicalBytes();
long maxBytes = Pointer.maxBytes();
long currPhys = Pointer.physicalBytes();
long currBytes = Pointer.totalBytes();
long jvmTotal = Runtime.getRuntime().totalMemory();
long jvmMax = Runtime.getRuntime().maxMemory();
int threadsAfter = ManagementFactory.getThreadMXBean().getThreadCount();
long duration = System.currentTimeMillis() - startTime;
sb.append(getClass().getSimpleName()).append(".").append(name.getMethodName())
.append(": ").append(duration).append(" ms")
.append(", threadCount: (").append(threadCountBefore).append("->").append(threadsAfter).append(")")
.append(", jvmTotal=").append(jvmTotal)
.append(", jvmMax=").append(jvmMax)
.append(", totalBytes=").append(currBytes).append(", maxBytes=").append(maxBytes)
.append(", currPhys=").append(currPhys).append(", maxPhys=").append(maxPhys);
List<MemoryWorkspace> ws = Nd4j.getWorkspaceManager().getAllWorkspacesForCurrentThread();
if(ws != null && ws.size() > 0){
long currSize = 0;
for(MemoryWorkspace w : ws){
currSize += w.getCurrentSize();
}
if(currSize > 0){
sb.append(", threadWSSize=").append(currSize)
.append(" (").append(ws.size()).append(" WSs)");
}
}
Properties p = Nd4j.getExecutioner().getEnvironmentInformation();
Object o = p.get("cuda.devicesInformation");
if(o instanceof List){
List<Map<String,Object>> l = (List<Map<String, Object>>) o;
if(l.size() > 0) {
sb.append(" [").append(l.size())
.append(" GPUs: ");
for (int i = 0; i < l.size(); i++) {
Map<String,Object> m = l.get(i);
if(i > 0)
sb.append(",");
sb.append("(").append(m.get("cuda.freeMemory")).append(" free, ")
.append(m.get("cuda.totalMemory")).append(" total)");
}
sb.append("]");
}
}
log.info(sb.toString());
}
}

View File

@ -38,7 +38,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import static org.junit.Assert.*;
public class MiscTests {
public class MiscTests extends BaseDL4JTest {
@Rule
public TemporaryFolder testDir = new TemporaryFolder();

View File

@ -24,6 +24,7 @@ import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator;
import org.deeplearning4j.nn.layers.recurrent.LSTM;
import org.deeplearning4j.nn.layers.recurrent.LastTimeStepLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel;
import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
@ -54,7 +55,7 @@ import java.util.List;
import static junit.framework.TestCase.assertTrue;
@Ignore("AB - 2019/05/27 - NPE on CUDA only. Ignored to get all passing baseline on master; see issue 7657")
public class FullModelComparisons {
public class FullModelComparisons extends BaseDL4JTest {
ClassLoader classLoader = FullModelComparisons.class.getClassLoader();

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.configurations;
import org.deeplearning4j.nn.conf.InputPreProcessor;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.preprocessors.KerasFlattenRnnPreprocessor;
import org.deeplearning4j.nn.modelimport.keras.preprocessors.PermutePreprocessor;
import org.deeplearning4j.nn.modelimport.keras.preprocessors.ReshapePreprocessor;
@ -26,7 +27,7 @@ import org.junit.Test;
import static org.junit.Assert.assertEquals;
public class JsonTest {
public class JsonTest extends BaseDL4JTest {
@Test
public void testJsonPreprocessors() throws Exception {

View File

@ -20,6 +20,7 @@ import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
@ -36,7 +37,7 @@ import java.io.InputStream;
*/
@Slf4j
public class Keras1ModelConfigurationTest {
public class Keras1ModelConfigurationTest extends BaseDL4JTest {
private ClassLoader classLoader = getClass().getClassLoader();

View File

@ -21,6 +21,7 @@ import lombok.val;
import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
@ -49,7 +50,7 @@ import static org.junit.Assert.assertArrayEquals;
*/
@Slf4j
public class Keras2ModelConfigurationTest {
public class Keras2ModelConfigurationTest extends BaseDL4JTest {
ClassLoader classLoader = getClass().getClassLoader();

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.configurations;
import org.deeplearning4j.nn.conf.distribution.*;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import java.util.Map;
import static org.junit.Assert.assertEquals;
public class KerasInitilizationTest {
public class KerasInitilizationTest extends BaseDL4JTest {
private double minValue = -0.2;
private double maxValue = 0.2;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.configurations;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException;
@ -38,10 +39,7 @@ import static org.junit.Assert.assertNotNull;
* Test import of Keras models.
*/
@Slf4j
public class KerasModelImportTest {
ClassLoader classLoader = KerasModelImportTest.class.getClassLoader();
public class KerasModelImportTest extends BaseDL4JTest {
@Test
public void testH5WithoutTensorflowScope() throws Exception {

View File

@ -20,6 +20,7 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.deeplearning4j.common.resources.DL4JResources;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
import org.deeplearning4j.nn.modelimport.keras.layers.custom.KerasLRN;
@ -41,7 +42,7 @@ import java.net.URL;
* @author Justin Long (crockpotveggies)
*/
@Slf4j
public class KerasCustomLayerTest {
public class KerasCustomLayerTest extends BaseDL4JTest {
@Rule
public TemporaryFolder testDir = new TemporaryFolder();

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.e2e;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLambdaLayer;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel;
@ -44,7 +45,7 @@ import java.nio.file.StandardCopyOption;
*
* @author Max Pumperla
*/
public class KerasLambdaTest {
public class KerasLambdaTest extends BaseDL4JTest {
@Rule
public TemporaryFolder testDir = new TemporaryFolder();

View File

@ -32,10 +32,7 @@ import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.layers.recurrent.LSTM;
import org.deeplearning4j.nn.layers.recurrent.LastTimeStepLayer;
import org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer;
import org.deeplearning4j.nn.modelimport.keras.Hdf5Archive;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel;
import org.deeplearning4j.nn.modelimport.keras.*;
import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelBuilder;
import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelUtils;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
@ -78,7 +75,7 @@ import static org.junit.Assert.assertTrue;
* @author dave@skymind.io, Max Pumperla
*/
@Slf4j
public class KerasModelEndToEndTest {
public class KerasModelEndToEndTest extends BaseDL4JTest {
private static final String GROUP_ATTR_INPUTS = "inputs";
private static final String GROUP_ATTR_OUTPUTS = "outputs";
private static final String GROUP_PREDICTIONS = "predictions";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.e2e;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
@ -50,7 +51,7 @@ import java.nio.file.StandardCopyOption;
* @author Max Pumperla
*/
@Slf4j
public class KerasYolo9000PredictTest {
public class KerasYolo9000PredictTest extends BaseDL4JTest {
private static final String DL4J_MODEL_FILE_NAME = ".";
private static ImagePreProcessingScaler IMAGE_PREPROCESSING_SCALER = new ImagePreProcessingScaler(0, 1);

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.e2e;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasSpaceToDepth;
@ -47,7 +48,7 @@ import java.nio.file.StandardCopyOption;
* @author Max Pumperla
*/
@Slf4j
public class KerasYolo9000Test {
public class KerasYolo9000Test extends BaseDL4JTest {
private static final String TEMP_MODEL_FILENAME = "tempModel";
private static final String H5_EXTENSION = ".h5";

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.advanced.activation;
import org.deeplearning4j.nn.conf.layers.ActivationLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasLeakyReLUTest {
public class KerasLeakyReLUTest extends BaseDL4JTest {
private Keras1LayerConfiguration conf1 = new Keras1LayerConfiguration();
private Keras2LayerConfiguration conf2 = new Keras2LayerConfiguration();

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.advanced.activation;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.PReLULayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -35,7 +36,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasPReLUTest {
public class KerasPReLUTest extends BaseDL4JTest {
private Keras1LayerConfiguration conf1 = new Keras1LayerConfiguration();
private Keras2LayerConfiguration conf2 = new Keras2LayerConfiguration();

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.advanced.activation;
import org.deeplearning4j.nn.conf.layers.ActivationLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasThresholdedReLUTest {
public class KerasThresholdedReLUTest extends BaseDL4JTest {
private Keras1LayerConfiguration conf1 = new Keras1LayerConfiguration();
private Keras2LayerConfiguration conf2 = new Keras2LayerConfiguration();

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.Convolution1DLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -40,7 +41,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasAtrousConvolution1DTest {
public class KerasAtrousConvolution1DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -38,7 +39,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasAtrousConvolution2DTest {
public class KerasAtrousConvolution2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.Convolution1DLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -37,7 +38,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasConvolution1DTest {
public class KerasConvolution1DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasConvolution2DTest {
public class KerasConvolution2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -43,7 +44,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasConvolution3DTest {
public class KerasConvolution3DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.convolutional.Cropping1D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -32,7 +33,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasCropping1DTest {
public class KerasCropping1DTest extends BaseDL4JTest {
private final String LAYER_NAME = "cropping_1D_layer";
private final int CROPPING = 2;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.convolutional.Cropping2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -32,7 +33,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasCropping2DTest {
public class KerasCropping2DTest extends BaseDL4JTest {
private final String LAYER_NAME = "cropping_2D_layer";
private final int[] CROPPING = new int[]{2, 3};

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.convolutional.Cropping2D;
import org.deeplearning4j.nn.conf.layers.convolutional.Cropping3D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -34,7 +35,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasCropping3DTest {
public class KerasCropping3DTest extends BaseDL4JTest {
private final String LAYER_NAME = "cropping_3D_layer";
private final int[] CROPPING = new int[]{2, 3, 5};

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.Deconvolution2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasDeconvolution2DTest {
public class KerasDeconvolution2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.DepthwiseConvolution2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -42,7 +43,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasDepthwiseConvolution2DTest {
public class KerasDepthwiseConvolution2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.SeparableConvolution2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasSeparableConvolution2DTest {
public class KerasSeparableConvolution2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.Upsampling1D;
import org.deeplearning4j.nn.conf.layers.Upsampling2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -35,7 +36,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasUpsampling1DTest {
public class KerasUpsampling1DTest extends BaseDL4JTest {
private final String LAYER_NAME = "upsampling_1D_layer";
private int size = 4;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.Upsampling2D;
import org.deeplearning4j.nn.conf.layers.ZeroPadding1DLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -35,7 +36,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasUpsampling2DTest {
public class KerasUpsampling2DTest extends BaseDL4JTest {
private final String LAYER_NAME = "upsampling_2D_layer";
private int[] size = new int[]{2, 2};

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.Upsampling3D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -33,7 +34,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasUpsampling3DTest {
public class KerasUpsampling3DTest extends BaseDL4JTest {
private final String LAYER_NAME = "upsampling_3D_layer";
private int[] size = new int[]{2, 2, 2};

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.ZeroPadding1DLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasZeroPadding1DTest {
public class KerasZeroPadding1DTest extends BaseDL4JTest {
private Keras1LayerConfiguration conf1 = new Keras1LayerConfiguration();
private Keras2LayerConfiguration conf2 = new Keras2LayerConfiguration();

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.ZeroPaddingLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -32,7 +33,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasZeroPadding2DTest {
public class KerasZeroPadding2DTest extends BaseDL4JTest {
private final String LAYER_NAME = "zero_padding_2D_layer";
private final int[] ZERO_PADDING = new int[]{2, 3};

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution;
import org.deeplearning4j.nn.conf.layers.ZeroPadding3DLayer;
import org.deeplearning4j.nn.conf.layers.ZeroPaddingLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -34,7 +35,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasZeroPadding3DTest {
public class KerasZeroPadding3DTest extends BaseDL4JTest {
private final String LAYER_NAME = "zero_padding_3D_layer";
private final int[] ZERO_PADDING = new int[]{2, 3, 4};

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.layers.ActivationLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -27,7 +28,7 @@ import java.util.Map;
import static org.junit.Assert.assertEquals;
public class KerasActivationLayer {
public class KerasActivationLayer extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasDenseTest {
public class KerasDenseTest extends BaseDL4JTest {
private Integer keras1 = 1;
private Integer keras2 = 2;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.DropoutLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasDropoutTest {
public class KerasDropoutTest extends BaseDL4JTest {
String LAYER_NAME = "dropout";
private final double DROPOUT_KERAS = 0.3;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasMaskingTest {
public class KerasMaskingTest extends BaseDL4JTest {
private Keras1LayerConfiguration conf1 = new Keras1LayerConfiguration();

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -33,7 +34,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasPermuteTest {
public class KerasPermuteTest extends BaseDL4JTest {
private Integer keras1 = 1;
private Integer keras2 = 2;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.layers.misc.RepeatVector;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -30,7 +31,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasRepeatVectorTest {
public class KerasRepeatVectorTest extends BaseDL4JTest {
String LAYER_NAME = "repeat";
private int REPEAT = 4;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -36,7 +37,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasReshapeTest {
public class KerasReshapeTest extends BaseDL4JTest {
private Integer keras1 = 1;
private Integer keras2 = 2;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.core;
import org.deeplearning4j.nn.conf.dropout.SpatialDropout;
import org.deeplearning4j.nn.conf.layers.DropoutLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasSpatialDropout2DTest {
public class KerasSpatialDropout2DTest extends BaseDL4JTest {
String LAYER_NAME = "spatial_dropout_2d";
private final double RATE_KERAS = 0.3;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.embeddings;
import org.deeplearning4j.nn.conf.layers.EmbeddingSequenceLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -36,7 +37,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasEmbeddingTest {
public class KerasEmbeddingTest extends BaseDL4JTest {
private final String LAYER_NAME = "embedding_sequence_layer";
private final String INIT_KERAS = "glorot_normal";

View File

@ -21,6 +21,7 @@ import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.LocallyConnected1D;
import org.deeplearning4j.nn.conf.layers.LocallyConnected2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasLocallyConnected1DTest {
public class KerasLocallyConnected1DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -20,6 +20,7 @@ import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.LocallyConnected2D;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -39,7 +40,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasLocallyConnected2DTest {
public class KerasLocallyConnected2DTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.noise;
import org.deeplearning4j.nn.conf.dropout.AlphaDropout;
import org.deeplearning4j.nn.conf.layers.DropoutLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasAlphaDropoutTest {
public class KerasAlphaDropoutTest extends BaseDL4JTest {
String LAYER_NAME = "alpha_dropout";
private final double RATE_KERAS = 0.3;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.noise;
import org.deeplearning4j.nn.conf.dropout.GaussianDropout;
import org.deeplearning4j.nn.conf.layers.DropoutLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasGaussianDropoutTest {
public class KerasGaussianDropoutTest extends BaseDL4JTest {
String LAYER_NAME = "gaussian_dropout";
private final double RATE_KERAS = 0.3;

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.noise;
import org.deeplearning4j.nn.conf.dropout.GaussianNoise;
import org.deeplearning4j.nn.conf.layers.DropoutLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasGaussianNoiseTest {
public class KerasGaussianNoiseTest extends BaseDL4JTest {
String LAYER_NAME = "gaussian_noise";
private final double STDDEV = 0.3;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.layers.normalization;
import org.deeplearning4j.nn.conf.layers.BatchNormalization;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -32,7 +33,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasBatchNormalizationTest {
public class KerasBatchNormalizationTest extends BaseDL4JTest {
public static final String PARAM_NAME_BETA = "beta";
private final String LAYER_NAME = "batch_norm_layer";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.pooling;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.layers.PoolingType;
import org.deeplearning4j.nn.conf.layers.Subsampling1DLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -33,7 +34,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasPooling1DTest {
public class KerasPooling1DTest extends BaseDL4JTest {
private final String LAYER_NAME = "test_layer";
private final int[] KERNEL_SIZE = new int[]{2};

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.pooling;
import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.layers.PoolingType;
import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -35,7 +36,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasPooling2DTest {
public class KerasPooling2DTest extends BaseDL4JTest {
private final String LAYER_NAME = "test_layer";
private final int[] KERNEL_SIZE = new int[]{1, 2};

View File

@ -20,6 +20,7 @@ import org.deeplearning4j.nn.conf.ConvolutionMode;
import org.deeplearning4j.nn.conf.layers.PoolingType;
import org.deeplearning4j.nn.conf.layers.Subsampling3DLayer;
import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -36,7 +37,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasPooling3DTest {
public class KerasPooling3DTest extends BaseDL4JTest {
private final String LAYER_NAME = "pooling_3d";
private final int[] KERNEL_SIZE = new int[]{2, 2, 2};

View File

@ -21,6 +21,7 @@ import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.LSTM;
import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
import org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -44,7 +45,7 @@ import static org.junit.Assert.assertNotNull;
/**
* @author Max Pumperla
*/
public class KerasLSTMTest {
public class KerasLSTMTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.recurrent;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
@ -35,7 +36,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasSimpleRnnTest {
public class KerasSimpleRnnTest extends BaseDL4JTest {
private final String ACTIVATION = "sigmoid";
private final String LAYER_NAME = "simple_rnn_layer";

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.layers.wrappers;
import org.deeplearning4j.nn.conf.layers.LSTM;
import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration;
import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration;
@ -33,7 +34,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Max Pumperla
*/
public class KerasBidirectionalTest {
public class KerasBidirectionalTest extends BaseDL4JTest {
private final String ACTIVATION_KERAS = "linear";
private final String ACTIVATION_DL4J = "identity";

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.nn.modelimport.keras.optimizers;
import org.deeplearning4j.config.DL4JSystemProperties;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel;
import org.deeplearning4j.nn.modelimport.keras.e2e.KerasModelEndToEndTest;
@ -33,7 +34,7 @@ import java.nio.file.StandardCopyOption;
import static java.io.File.createTempFile;
public class OptimizerImport {
public class OptimizerImport extends BaseDL4JTest {
@Test
public void importAdam() throws Exception {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.nn.modelimport.keras.preprocessing.sequence;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
import org.deeplearning4j.nn.modelimport.keras.preprocessing.text.KerasTokenizer;
import org.junit.Test;
@ -29,7 +30,7 @@ import java.io.IOException;
*
* @author Max Pumperla
*/
public class TimeSeriesGeneratorImportTest {
public class TimeSeriesGeneratorImportTest extends BaseDL4JTest {
@Test
public void importTimeSeriesTest() throws IOException, InvalidKerasConfigurationException {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.nn.modelimport.keras.preprocessing.sequence;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
@ -24,7 +25,7 @@ import org.nd4j.linalg.primitives.Pair;
import static org.junit.Assert.assertEquals;
public class TimeSeriesGeneratorTest {
public class TimeSeriesGeneratorTest extends BaseDL4JTest {
@Test
public void tsGeneratorTest() throws InvalidKerasConfigurationException {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.nn.modelimport.keras.preprocessing.text;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
import org.junit.Test;
import org.nd4j.linalg.io.ClassPathResource;
@ -33,7 +34,7 @@ import static org.junit.Assert.assertTrue;
*
* @author Max Pumperla
*/
public class TokenizerImportTest {
public class TokenizerImportTest extends BaseDL4JTest {
ClassLoader classLoader = getClass().getClassLoader();

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.nn.modelimport.keras.preprocessing.text;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertEquals;
*
* @author Max Pumperla
*/
public class TokenizerTest {
public class TokenizerTest extends BaseDL4JTest {
@Test
public void tokenizerBasics() {

View File

@ -19,6 +19,7 @@ package org.deeplearning4j.nn.modelimport.keras.weights;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest;
import org.deeplearning4j.nn.modelimport.keras.KerasLayer;
import org.deeplearning4j.nn.modelimport.keras.KerasModel;
import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasSpaceToDepth;
@ -42,7 +43,7 @@ import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
@Slf4j
public class KerasWeightSettingTests {
public class KerasWeightSettingTests extends BaseDL4JTest {
@Rule
public final TemporaryFolder testDir = new TemporaryFolder();

View File

@ -0,0 +1,140 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.nearestneighbor.server;
import lombok.extern.slf4j.Slf4j;
import org.bytedeco.javacpp.Pointer;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.TestName;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.ops.executioner.OpExecutioner;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.profiler.ProfilerConfig;
import java.lang.management.ManagementFactory;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@Slf4j
public class BaseDL4JTest {
@Rule
public TestName name = new TestName();
protected long startTime;
protected int threadCountBefore;
/**
* Override this to set the profiling mode for the tests defined in the child class
*/
public OpExecutioner.ProfilingMode getProfilingMode(){
return OpExecutioner.ProfilingMode.SCOPE_PANIC;
}
/**
* Override this to set the datatype of the tests defined in the child class
*/
public DataType getDataType(){
return DataType.DOUBLE;
}
public DataType getDefaultFPDataType(){
return getDataType();
}
@Before
public void beforeTest(){
log.info("{}.{}", getClass().getSimpleName(), name.getMethodName());
Nd4j.getExecutioner().setProfilingMode(getProfilingMode());
Nd4j.getExecutioner().setProfilingConfig(ProfilerConfig.builder().build());
Nd4j.setDefaultDataTypes(getDataType(), getDefaultFPDataType());
startTime = System.currentTimeMillis();
threadCountBefore = ManagementFactory.getThreadMXBean().getThreadCount();
}
@After
public void afterTest(){
//Attempt to keep workspaces isolated between tests
Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
MemoryWorkspace currWS = Nd4j.getMemoryManager().getCurrentWorkspace();
Nd4j.getMemoryManager().setCurrentWorkspace(null);
if(currWS != null){
//Not really safe to continue testing under this situation... other tests will likely fail with obscure
// errors that are hard to track back to this
log.error("Open workspace leaked from test! Exiting - {}, isOpen = {} - {}", currWS.getId(), currWS.isScopeActive(), currWS);
System.exit(1);
}
StringBuilder sb = new StringBuilder();
long maxPhys = Pointer.maxPhysicalBytes();
long maxBytes = Pointer.maxBytes();
long currPhys = Pointer.physicalBytes();
long currBytes = Pointer.totalBytes();
long jvmTotal = Runtime.getRuntime().totalMemory();
long jvmMax = Runtime.getRuntime().maxMemory();
int threadsAfter = ManagementFactory.getThreadMXBean().getThreadCount();
long duration = System.currentTimeMillis() - startTime;
sb.append(getClass().getSimpleName()).append(".").append(name.getMethodName())
.append(": ").append(duration).append(" ms")
.append(", threadCount: (").append(threadCountBefore).append("->").append(threadsAfter).append(")")
.append(", jvmTotal=").append(jvmTotal)
.append(", jvmMax=").append(jvmMax)
.append(", totalBytes=").append(currBytes).append(", maxBytes=").append(maxBytes)
.append(", currPhys=").append(currPhys).append(", maxPhys=").append(maxPhys);
List<MemoryWorkspace> ws = Nd4j.getWorkspaceManager().getAllWorkspacesForCurrentThread();
if(ws != null && ws.size() > 0){
long currSize = 0;
for(MemoryWorkspace w : ws){
currSize += w.getCurrentSize();
}
if(currSize > 0){
sb.append(", threadWSSize=").append(currSize)
.append(" (").append(ws.size()).append(" WSs)");
}
}
Properties p = Nd4j.getExecutioner().getEnvironmentInformation();
Object o = p.get("cuda.devicesInformation");
if(o instanceof List){
List<Map<String,Object>> l = (List<Map<String, Object>>) o;
if(l.size() > 0) {
sb.append(" [").append(l.size())
.append(" GPUs: ");
for (int i = 0; i < l.size(); i++) {
Map<String,Object> m = l.get(i);
if(i > 0)
sb.append(",");
sb.append("(").append(m.get("cuda.freeMemory")).append(" free, ")
.append(m.get("cuda.totalMemory")).append(" total)");
}
sb.append("]");
}
}
log.info(sb.toString());
}
}
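
Not part of the diff itself, but for orientation: a minimal sketch of how the test classes changed above consume this base class. The class and test names below are hypothetical; subclasses inherit the per-test data-type and profiling setup from beforeTest() plus the workspace-leak and memory/timing checks from afterTest(), and can override getDataType() where a suite needs non-default precision.

import org.junit.Test;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import static org.junit.Assert.assertEquals;

public class ExampleMigratedTest extends BaseDL4JTest {

    // Optional override: run this class's tests in single precision instead of the DOUBLE default
    @Override
    public DataType getDataType() {
        return DataType.FLOAT;
    }

    @Test
    public void testSomething() {
        // beforeTest() has already applied the default data types and profiling mode;
        // afterTest() will later check that no workspace was left open and log timing/memory stats
        INDArray arr = Nd4j.rand(DataType.FLOAT, new int[]{3, 3});
        assertEquals(9, arr.length());
    }
}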

View File

@ -44,7 +44,7 @@ import static org.junit.Assert.assertEquals;
/**
* Created by agibsonccc on 4/27/17.
*/
public class NearestNeighborTest {
public class NearestNeighborTest extends BaseDL4JTest {
@Rule
public TemporaryFolder testDir = new TemporaryFolder();

View File

@ -175,7 +175,7 @@ public class KDTree implements Serializable {
return Pair.of(Double.POSITIVE_INFINITY, null);
int _discNext = (_disc + 1) % dims;
double dist2 = Nd4j.getExecutioner().execAndReturn(new EuclideanDistance(point, Nd4j.zeros(point.shape()))).getFinalResult().doubleValue();
double dist2 = Nd4j.getExecutioner().execAndReturn(new EuclideanDistance(point, Nd4j.zeros(point.dataType(), point.shape()))).getFinalResult().doubleValue();
if (dist2 < dist) {
best = node.getPoint();
dist = dist2;
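
The one-line KDTree change above creates the zeros array via Nd4j.zeros(point.dataType(), point.shape()), so the EuclideanDistance reduce op receives two buffers of the same data type rather than a zeros array of the global default type. A minimal, self-contained sketch of the pattern follows; the class name is hypothetical and the import locations are assumptions that may vary between ND4J versions.

import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.reduce3.EuclideanDistance;
import org.nd4j.linalg.factory.Nd4j;

public class DistanceToOriginExample {
    public static void main(String[] args) {
        INDArray point = Nd4j.rand(DataType.FLOAT, new int[]{1, 3});
        // Build the origin with the same data type as 'point'; mixing e.g. FLOAT and
        // DOUBLE operands in the distance op is the mismatch the fix above avoids
        INDArray origin = Nd4j.zeros(point.dataType(), point.shape());
        double dist = Nd4j.getExecutioner()
                .execAndReturn(new EuclideanDistance(point, origin))
                .getFinalResult().doubleValue();
        System.out.println("Euclidean distance to origin: " + dist);
    }
}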

View File

@ -0,0 +1,140 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.clustering;
import lombok.extern.slf4j.Slf4j;
import org.bytedeco.javacpp.Pointer;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.TestName;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.ops.executioner.OpExecutioner;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.profiler.ProfilerConfig;
import java.lang.management.ManagementFactory;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@Slf4j
public class BaseDL4JTest {
@Rule
public TestName name = new TestName();
protected long startTime;
protected int threadCountBefore;
/**
* Override this to set the profiling mode for the tests defined in the child class
*/
public OpExecutioner.ProfilingMode getProfilingMode(){
return OpExecutioner.ProfilingMode.SCOPE_PANIC;
}
/**
* Override this to set the datatype of the tests defined in the child class
*/
public DataType getDataType(){
return DataType.DOUBLE;
}
public DataType getDefaultFPDataType(){
return getDataType();
}
@Before
public void beforeTest(){
log.info("{}.{}", getClass().getSimpleName(), name.getMethodName());
Nd4j.getExecutioner().setProfilingMode(getProfilingMode());
Nd4j.getExecutioner().setProfilingConfig(ProfilerConfig.builder().build());
Nd4j.setDefaultDataTypes(getDataType(), getDefaultFPDataType());
startTime = System.currentTimeMillis();
threadCountBefore = ManagementFactory.getThreadMXBean().getThreadCount();
}
@After
public void afterTest(){
//Attempt to keep workspaces isolated between tests
Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
MemoryWorkspace currWS = Nd4j.getMemoryManager().getCurrentWorkspace();
Nd4j.getMemoryManager().setCurrentWorkspace(null);
if(currWS != null){
//Not really safe to continue testing under this situation... other tests will likely fail with obscure
// errors that are hard to track back to this
log.error("Open workspace leaked from test! Exiting - {}, isOpen = {} - {}", currWS.getId(), currWS.isScopeActive(), currWS);
System.exit(1);
}
StringBuilder sb = new StringBuilder();
long maxPhys = Pointer.maxPhysicalBytes();
long maxBytes = Pointer.maxBytes();
long currPhys = Pointer.physicalBytes();
long currBytes = Pointer.totalBytes();
long jvmTotal = Runtime.getRuntime().totalMemory();
long jvmMax = Runtime.getRuntime().maxMemory();
int threadsAfter = ManagementFactory.getThreadMXBean().getThreadCount();
long duration = System.currentTimeMillis() - startTime;
sb.append(getClass().getSimpleName()).append(".").append(name.getMethodName())
.append(": ").append(duration).append(" ms")
.append(", threadCount: (").append(threadCountBefore).append("->").append(threadsAfter).append(")")
.append(", jvmTotal=").append(jvmTotal)
.append(", jvmMax=").append(jvmMax)
.append(", totalBytes=").append(currBytes).append(", maxBytes=").append(maxBytes)
.append(", currPhys=").append(currPhys).append(", maxPhys=").append(maxPhys);
List<MemoryWorkspace> ws = Nd4j.getWorkspaceManager().getAllWorkspacesForCurrentThread();
if(ws != null && ws.size() > 0){
long currSize = 0;
for(MemoryWorkspace w : ws){
currSize += w.getCurrentSize();
}
if(currSize > 0){
sb.append(", threadWSSize=").append(currSize)
.append(" (").append(ws.size()).append(" WSs)");
}
}
Properties p = Nd4j.getExecutioner().getEnvironmentInformation();
Object o = p.get("cuda.devicesInformation");
if(o instanceof List){
List<Map<String,Object>> l = (List<Map<String, Object>>) o;
if(l.size() > 0) {
sb.append(" [").append(l.size())
.append(" GPUs: ");
for (int i = 0; i < l.size(); i++) {
Map<String,Object> m = l.get(i);
if(i > 0)
sb.append(",");
sb.append("(").append(m.get("cuda.freeMemory")).append(" free, ")
.append(m.get("cuda.totalMemory")).append(" total)");
}
sb.append("]");
}
}
log.info(sb.toString());
}
}
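
This second copy of BaseDL4JTest, added under org.deeplearning4j.clustering for the clustering-module tests below, exposes the same hooks. As another hypothetical sketch, a subclass can also override getProfilingMode(), for example to skip the SCOPE_PANIC checks in a long-running test class, assuming OpExecutioner.ProfilingMode.DISABLED is available in the ND4J version in use.

import org.nd4j.linalg.api.ops.executioner.OpExecutioner;

public class LongRunningClusteringTest extends BaseDL4JTest {

    // Hypothetical override: disable scope-panic profiling for this class's tests
    @Override
    public OpExecutioner.ProfilingMode getProfilingMode() {
        return OpExecutioner.ProfilingMode.DISABLED;
    }
}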

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.clustering.kdtree;
import com.google.common.primitives.Doubles;
import lombok.val;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.joda.time.Duration;
import org.junit.Before;
import org.junit.BeforeClass;
@ -40,7 +41,7 @@ import static org.junit.Assert.assertTrue;
/**
* Created by agibsonccc on 1/1/15.
*/
public class KDTreeTest {
public class KDTreeTest extends BaseDL4JTest {
private KDTree kdTree;

View File

@ -17,6 +17,7 @@
package org.deeplearning4j.clustering.kmeans;
import org.apache.commons.lang3.time.StopWatch;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.deeplearning4j.clustering.algorithm.Distance;
import org.deeplearning4j.clustering.cluster.*;
import org.junit.Ignore;
@ -33,7 +34,7 @@ import static org.junit.Assert.fail;
/**
* Created by agibsonccc on 7/2/17.
*/
public class KMeansTest {
public class KMeansTest extends BaseDL4JTest {
@Test
public void testKMeans() {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.clustering.lsh;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
@ -31,7 +32,7 @@ import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class RandomProjectionLSHTest {
public class RandomProjectionLSHTest extends BaseDL4JTest {
int hashLength = 31;
int numTables = 2;

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.clustering.quadtree;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
@ -26,7 +27,7 @@ import static org.junit.Assert.assertTrue;
/**
* Created by agibsonccc on 1/2/15.
*/
public class QuadTreeTest {
public class QuadTreeTest extends BaseDL4JTest {
@Test
public void testQuadTree() {

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.clustering.randomprojection;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator;
import org.junit.Before;
import org.junit.Test;
@ -31,7 +32,7 @@ import java.util.List;
import static org.junit.Assert.*;
public class RPTreeTest {
public class RPTreeTest extends BaseDL4JTest {
@Before
public void setUp() {

View File

@ -16,13 +16,14 @@
package org.deeplearning4j.clustering.randomprojection;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import static org.junit.Assert.assertEquals;
public class RPUtilsTest {
public class RPUtilsTest extends BaseDL4JTest {
@Test
public void testDistanceComputeBatch() {

View File

@ -18,6 +18,7 @@ package org.deeplearning4j.clustering.sptree;
import com.google.common.util.concurrent.AtomicDouble;
import org.apache.commons.lang3.time.StopWatch;
import org.deeplearning4j.clustering.BaseDL4JTest;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
@ -33,7 +34,7 @@ import static org.junit.Assert.*;
/**
* @author Adam Gibson
*/
public class SPTreeTest {
public class SPTreeTest extends BaseDL4JTest {
@Before
public void setUp() {

Some files were not shown because too many files have changed in this diff.