From 9af4f9f23a6577b73dd31018a06ad80d237df607 Mon Sep 17 00:00:00 2001
From: brian
Date: Fri, 24 Mar 2023 15:04:06 +0100
Subject: [PATCH] Playing with some new code

Introduce the new net.brutex.ai.dnn API surface: ILayer, ILayerConfiguration,
IModel, INeuralNetwork and INeuralNetworkConfiguration, together with
DenseLayerConfiguration, FeedForwardLayerConfiguration and
ArtificialNeuralNetwork. The earlier AbstractNeuralNetwork and NeuralNetwork
prototypes and the BuildingBlockLayer wrapper are removed.

Callers of ComputationGraph#getConfiguration() now use
getComputationGraphConfiguration(), test log messages refer to "ILayer"
instead of "Layer", the GAN example in brutex-extended-tests is reworked to
use an explicit Adam updater, WeightInitXavier and ActivationIdentity, and
the tests dnnTest and FFLayerTest are added.

Signed-off-by: brian
---
 .../src/test/java/net/brutex/gan/App.java | 65 +-
 .../test/java/net/brutex/spark/BrianTest.java | 2 +-
 .../java/net/brutex/spark/BrianTest2.java | 2 +-
 .../java/net/brutex/spark/TestServer.java | 2 +-
 .../java/net/brutex/spark/TestServer2.java | 2 +-
 .../IntegrationTestBaselineGenerator.java | 4 +-
 .../integration/IntegrationTestRunner.java | 28 +-
 .../deeplearning4j/integration/TestUtils.java | 4 +-
 .../java/org/deeplearning4j/TestUtils.java | 4 +-
 .../org/deeplearning4j/eval/EvalTest.java | 2 +-
 .../gradientcheck/BNGradientCheckTest.java | 14 +-
 .../gradientcheck/CNN1DGradientCheckTest.java | 8 +-
 .../gradientcheck/CNN3DGradientCheckTest.java | 8 +-
 .../gradientcheck/CNNGradientCheckTest.java | 14 +-
 .../GlobalPoolingGradientCheckTests.java | 8 +-
 .../gradientcheck/GradientCheckTests.java | 16 +-
 .../GradientCheckTestsComputationGraph.java | 50 +-
 .../gradientcheck/LRNGradientCheckTests.java | 2 +-
 .../gradientcheck/LSTMGradientCheckTests.java | 6 +-
 .../NoBiasGradientCheckTests.java | 2 +-
 .../OutputLayerGradientChecks.java | 6 +-
 .../gradientcheck/VaeGradientCheckTests.java | 8 +-
 .../nn/conf/layers/LayerConfigTest.java | 36 +-
 .../deeplearning4j/nn/dtypes/DTypeTests.java | 4 +-
 .../nn/graph/ComputationGraphTestRNN.java | 10 +-
 .../nn/graph/TestCompGraphUnsupervised.java | 5 +-
 .../nn/graph/TestComputationGraphNetwork.java | 28 +-
 .../nn/layers/FrozenLayerTest.java | 2 +-
 .../deeplearning4j/nn/layers/TestDropout.java | 2 +-
 .../embedding/EmbeddingLayerTest.java | 4 +-
 .../nn/layers/ocnn/OCNNOutputLayerTest.java | 2 +-
 .../samediff/testlayers/SameDiffDense.java | 2 +-
 .../testlayers/SameDiffDenseVertex.java | 4 +-
 .../nn/misc/WorkspaceTests.java | 8 +-
 .../nn/multilayer/MultiLayerTest.java | 2 +-
 .../nn/multilayer/MultiLayerTestRNN.java | 2 +-
 .../rl/TestMultiModelGradientApplication.java | 4 +-
 .../TestTransferLearningModelSerializer.java | 2 +-
 .../TransferLearningCompGraphTest.java | 6 +-
 .../TransferLearningHelperTest.java | 2 +-
 .../optimize/solver/TestOptimizers.java | 4 +-
 .../regressiontest/RegressionTest060.java | 2 +-
 .../regressiontest/RegressionTest071.java | 2 +-
 .../regressiontest/RegressionTest080.java | 2 +-
 .../regressiontest/RegressionTest100a.java | 2 +-
 .../regressiontest/RegressionTest100b3.java | 2 +-
 .../regressiontest/RegressionTest100b4.java | 2 +-
 .../regressiontest/RegressionTest100b6.java | 2 +-
 .../customlayer100a/CustomLayer.java | 6 +-
 .../util/CrashReportingUtilTest.java | 12 +-
 .../util/ModelSerializerTest.java | 4 +-
 .../cuda/recurrent/CudnnLSTMHelper.java | 2 +-
 .../nn/modelimport/keras/KerasLayer.java | 8 +-
 .../nn/modelimport/keras/KerasModel.java | 4 +-
 .../keras/config/KerasLayerConfiguration.java | 2 +-
 .../keras/layers/core/KerasDense.java | 4 +-
 .../keras/layers/recurrent/KerasLSTM.java | 4 +-
 .../layers/recurrent/KerasSimpleRnn.java | 2 +-
 .../layers/wrappers/KerasBidirectional.java | 4 +-
 .../configurations/FullModelComparisons.java | 4 +-
 .../brutex/ai/dnn/api/LayerConfiguration.java | 9 +
 cavis-dnn/cavis-dnn-nn/build.gradle | 6 +-
 .../ILayer.java} | 22 +-
 .../ILayerConfiguration.java} | 57 +-
 .../java/net/brutex/ai/dnn/api/IModel.java | 86 +
 .../brutex/ai/dnn/api/INeuralNetwork.java} | 53 +-
 .../dnn/api/INeuralNetworkConfiguration.java | 52 +
 .../dnn/conf/NeuralNetworkConfiguration.java | 708 +-
 .../layer/AbstractLayerConfiguration.java | 10 +-
 .../conf/layer/DenseLayerConfiguration.java | 62 +
 .../layer/FeedForwardLayerConfiguration.java | 99 +
 .../impl/network/AbstractNeuralNetwork.java | 72 -
 .../ai/dnn/impl/network/NeuralNetwork.java | 692 --
 .../dnn/networks/ArtificialNeuralNetwork.java | 53 +
 .../trainer/BaseEarlyStoppingTrainer.java | 2 +-
 .../gradientcheck/GradientCheckUtil.java | 6 +-
 .../java/org/deeplearning4j/nn/api/Layer.java | 377 +-
 .../deeplearning4j/nn/api/ModelAdapter.java | 2 +-
 .../nn/api/ParamInitializer.java | 10 +-
 .../deeplearning4j/nn/api/TrainingConfig.java | 2 +-
 .../org/deeplearning4j/nn/api/Updater.java | 2 +-
 .../nn/api/layers/LayerConstraint.java | 2 +-
 .../nn/api/layers/RecurrentLayer.java | 6 +-
 .../nn/conf/NeuralNetConfiguration.java | 5 +-
 .../nn/conf/constraint/MaxNormConstraint.java | 4 +-
 .../conf/constraint/MinMaxNormConstraint.java | 6 +-
 .../conf/constraint/UnitNormConstraint.java | 4 +-
 .../nn/conf/graph/LayerVertex.java | 7 +-
 .../nn/conf/layers/ActivationLayer.java | 2 +-
 .../nn/conf/layers/BaseLayer.java | 4 +-
 .../nn/conf/layers/CapsuleLayer.java | 4 +-
 .../nn/conf/layers/DenseLayer.java | 4 +-
 .../deeplearning4j/nn/conf/layers/Layer.java | 4 +-
 .../nn/conf/layers/LayerValidation.java | 4 +-
 .../layers/LocalResponseNormalization.java | 2 +-
 .../nn/conf/layers/PrimaryCapsules.java | 2 +-
 .../misc/ElementWiseMultiplicationLayer.java | 2 +-
 .../layers/recurrent/TimeDistributed.java | 2 +-
 .../layers/samediff/SameDiffLambdaLayer.java | 2 +-
 .../layers/samediff/SameDiffLambdaVertex.java | 2 +-
 .../layers/wrapper/BuildingBlockLayer.java | 97 -
 .../nn/conf/memory/NetworkMemoryReport.java | 2 +-
 .../nn/conf/weightnoise/IWeightNoise.java | 2 +-
 .../nn/graph/ComputationGraph.java | 230 +-
 .../nn/graph/vertex/BaseGraphVertex.java | 4 +-
 .../nn/graph/vertex/GraphVertex.java | 4 +-
 .../nn/graph/vertex/impl/LayerVertex.java | 6 +-
 .../impl/rnn/DuplicateToTimeSeriesVertex.java | 4 +-
 .../vertex/impl/rnn/LastTimeStepVertex.java | 4 +-
 .../impl/rnn/ReverseTimeSeriesVertex.java | 4 +-
 .../nn/layers/recurrent/LSTMHelpers.java | 2 +-
 .../nn/multilayer/MultiLayerNetwork.java | 8061 +++++++++--------
 .../nn/transferlearning/TransferLearning.java | 2 +-
 .../TransferLearningHelper.java | 4 +-
 .../nn/updater/BaseMultiLayerUpdater.java | 4 +-
 .../optimize/api/TrainingListener.java | 4 +-
 .../listeners/CheckpointListener.java | 4 +-
 .../optimize/solvers/BaseOptimizer.java | 6 +-
 .../util/Convolution1DUtils.java | 2 +-
 .../util/CrashReportingUtil.java | 30 +-
 .../deeplearning4j/util/ModelSerializer.java | 2 +-
 .../org/deeplearning4j/util/NetworkUtils.java | 8 +-
 .../deeplearning4j/util/OutputLayerUtil.java | 2 +-
 .../deeplearning4j/util/TimeSeriesUtils.java | 2 +-
 .../java/net/brutex/ai/dnn/api/dnnTest.java | 127 +
 .../brutex/ai/dnn/conf/layer/FFLayerTest.java | 47 +
 .../nn/layers/HelperUtilsTest.java | 2 +-
 .../parallelism/InplaceParallelInference.java | 3 +-
 .../parallelism/ParallelInference.java | 2 +-
 .../parallelism/trainer/DefaultTrainer.java | 4 +-
 .../impl/graph/SparkComputationGraph.java | 2 +-
 ...VaeReconstructionErrorWithKeyFunction.java | 2 +-
 ...GVaeReconstructionProbWithKeyFunction.java | 2 +-
 ...VaeReconstructionErrorWithKeyFunction.java | 2 +-
 .../VaeReconstructionProbWithKeyFunction.java | 2 +-
 .../ParameterAveragingTrainingMaster.java | 4 +-
 .../spark/impl/misc/TestFrozenLayers.java | 4 +-
 ...TestSparkMultiLayerParameterAveraging.java | 10 +-
 .../pw/SharedTrainingWrapper.java | 6 +-
 .../training/SharedTrainingMaster.java | 2 +-
 .../ui/model/stats/BaseStatsListener.java | 5 +-
 .../ui/model/stats/impl/SbeStatsReport.java | 4 +-
 .../ui/module/train/TrainModuleUtils.java | 8 +-
 .../templates/TrainingModel.html.ftl | 6 +-
 .../org/deeplearning4j/zoo/TestUtils.java | 2 +-
 settings.gradle | 2 +-
 146 files changed, 6151 insertions(+), 5493 deletions(-)
 rename cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/{conf/layer/LayerConfiguration.java => api/ILayer.java} (60%)
 rename cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/{conf/layer/FFLayer.java => api/ILayerConfiguration.java} (56%)
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/IModel.java
 rename cavis-dnn/cavis-dnn-nn/src/main/java/{org/deeplearning4j/nn/api/NeuralNetwork.java => net/brutex/ai/dnn/api/INeuralNetwork.java} (58%)
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetworkConfiguration.java
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/DenseLayerConfiguration.java
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FeedForwardLayerConfiguration.java
 delete mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/AbstractNeuralNetwork.java
 delete mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/NeuralNetwork.java
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/networks/ArtificialNeuralNetwork.java
 delete mode 100644 cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/wrapper/BuildingBlockLayer.java
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/api/dnnTest.java
 create mode 100644 cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/conf/layer/FFLayerTest.java
diff --git a/brutex-extended-tests/src/test/java/net/brutex/gan/App.java b/brutex-extended-tests/src/test/java/net/brutex/gan/App.java index f5b47031b..fca68610a 100644 --- a/brutex-extended-tests/src/test/java/net/brutex/gan/App.java +++ b/brutex-extended-tests/src/test/java/net/brutex/gan/App.java @@ -21,8 +21,19 @@ package net.brutex.gan; -import java.util.List; +import java.awt.BorderLayout; +import java.awt.Dimension; +import java.awt.GridLayout; +import java.awt.Image; +import java.awt.image.BufferedImage; +import java.io.File; +import java.util.Arrays; import java.util.Random; +import javax.swing.ImageIcon; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.WindowConstants; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.ArrayUtils; import org.datavec.api.split.FileSplit; @@ -34,20 +45,23 @@ import org.datavec.image.transform.PipelineImageTransform; import org.datavec.image.transform.ResizeImageTransform; import org.datavec.image.transform.ShowImageTransform; import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator; -import org.deeplearning4j.nn.conf.CacheMode; import org.deeplearning4j.nn.conf.GradientNormalization; import org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; -import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.layers.*; +import org.deeplearning4j.nn.conf.layers.ActivationLayer; +import org.deeplearning4j.nn.conf.layers.DenseLayer; +import org.deeplearning4j.nn.conf.layers.DropoutLayer; +import org.deeplearning4j.nn.conf.layers.Layer; +import org.deeplearning4j.nn.conf.layers.OutputLayer; import 
org.deeplearning4j.nn.conf.layers.misc.FrozenLayerWithBackprop; -import org.deeplearning4j.nn.conf.layers.wrapper.BuildingBlockLayer; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.WeightInitXavier; import org.deeplearning4j.optimize.listeners.ScoreToChartListener; import org.junit.jupiter.api.Test; import org.nd4j.linalg.activations.Activation; +import org.nd4j.linalg.activations.impl.ActivationIdentity; import org.nd4j.linalg.activations.impl.ActivationLReLU; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.dataset.DataSet; @@ -55,13 +69,6 @@ import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.learning.config.Adam; import org.nd4j.linalg.learning.config.IUpdater; - - -import javax.swing.*; -import java.awt.*; -import java.awt.image.BufferedImage; -import java.io.File; -import java.util.Arrays; import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction; @Slf4j @@ -106,7 +113,7 @@ public class App { * @return config */ private static MultiLayerConfiguration generator() { - MultiLayerConfiguration confxx = new NeuralNetConfiguration.Builder() + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(42) .updater(UPDATER) .gradientNormalization(GradientNormalization.RenormalizeL2PerLayer) @@ -117,23 +124,8 @@ public class App { .setInputType(InputType.convolutional(X_DIM, Y_DIM, CHANNELS)) // .inputPreProcessor("CNN1", new FeedForwardToCnnPreProcessor(Y_DIM, X_DIM, CHANNELS)) .build(); - log.debug("Generator network: \n{}", confxx.toJson()); - NeuralNetworkConfiguration conf2 = NeuralNetworkConfiguration.builder().build(); - - NeuralNetworkConfiguration confx = NeuralNetworkConfiguration.builder() - .cacheMode(CacheMode.HOST) - .layer( new DenseLayer.Builder().build()) - .layer( new DenseLayer.Builder().build()) - .layer( BuildingBlockLayer.builder().build()) - .layers( List.of(genLayers())) - .inputType(InputType.convolutional(X_DIM, Y_DIM, CHANNELS)) - .build(); - - - - - return confx; + return conf; } private static Layer[] disLayers() { @@ -155,6 +147,7 @@ public class App { } private static MultiLayerConfiguration discriminator() { + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(42) .updater(UPDATER) @@ -183,13 +176,13 @@ public class App { MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(42) - .updater(UPDATER) - .gradientNormalization(GradientNormalization.RenormalizeL2PerLayer) - .gradientNormalizationThreshold(GRADIENT_THRESHOLD) - .weightInit(WeightInit.XAVIER) - .activation(Activation.IDENTITY) - .list(layers) - .setInputType(InputType.convolutional(X_DIM, Y_DIM, CHANNELS)) + .updater( Adam.builder().learningRate(0.0002).beta1(0.5).build() ) + .gradientNormalization( GradientNormalization.RenormalizeL2PerLayer) + .gradientNormalizationThreshold( 100 ) + .weightInit( new WeightInitXavier() ) + .activation( new ActivationIdentity()) + .list( layers ) + .setInputType( InputType.convolutional(X_DIM, Y_DIM, CHANNELS)) .build(); return conf; diff --git a/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest.java b/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest.java index efb54aa29..bc0aafa13 100644 --- a/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest.java +++ b/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest.java @@ -295,7 +295,7 @@ public class BrianTest extends 
BaseSparkSessionTest { .activation(Activation.RELU).l2(0.001).build()) .layer(1, new DenseLayer.Builder().nIn(20).nOut(20).weightInit(WeightInit.XAVIER) .activation(Activation.RELU).build()) - //.layer(2, new DenseLayer.Builder().nIn(9).nOut(9).weightInit(WeightInit.XAVIER).activation(Activation.RELU).build()) + //.layer(2, new DenseLayerConfiguration.Builder().nIn(9).nOut(9).weightInit(WeightInit.XAVIER).activation(Activation.RELU).build()) .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.XENT).nIn(20).nOut(4) .weightInit(WeightInit.XAVIER).activation(Activation.SIGMOID).build()) .build(); diff --git a/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest2.java b/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest2.java index 4e340c69a..f32c3c4de 100644 --- a/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest2.java +++ b/brutex-extended-tests/src/test/java/net/brutex/spark/BrianTest2.java @@ -301,7 +301,7 @@ public class BrianTest2 /*extends BaseDL4JTest*/ { .list() .layer(0, new DenseLayer.Builder().nIn(5).nOut(20).weightInit(WeightInit.XAVIER).activation(Activation.RELU).l2(0.001).build()) .layer(1, new DenseLayer.Builder().nIn(20).nOut(20).weightInit(WeightInit.XAVIER).activation(Activation.RELU).build()) - //.layer(2, new DenseLayer.Builder().nIn(9).nOut(9).weightInit(WeightInit.XAVIER).activation(Activation.RELU).build()) + //.layer(2, new DenseLayerConfiguration.Builder().nIn(9).nOut(9).weightInit(WeightInit.XAVIER).activation(Activation.RELU).build()) .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.XENT).nIn(20).nOut(4).weightInit(WeightInit.XAVIER).activation(Activation.SIGMOID).build()) .build(); diff --git a/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer.java b/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer.java index 353195da4..b81f70fc8 100644 --- a/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer.java +++ b/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer.java @@ -95,7 +95,7 @@ public class TestServer { .list() //.layer(0, new ConvolutionLayer.Builder().nIn(1).kernelSize(1, 5).stride(1,1).padding(0,2).nOut(1).name("1st Filter").updater(new Adam.Builder().learningRate(0.2).build()).build()) //.layer(1, new ConvolutionLayer.Builder().nIn(1).kernelSize(1, 2).stride(1,2).padding(0,0).nOut(1).name("2nd Filter").updater(new Adam.Builder().learningRate(0.1).build()).build()) - // .layer(1, new DenseLayer.Builder().nIn(10).nOut(64).activation(Activation.RELU).build()) + // .layer(1, new DenseLayerConfiguration.Builder().nIn(10).nOut(64).activation(Activation.RELU).build()) .layer(0, new DenseLayer.Builder().nIn(10).nOut(100).activation(Activation.RELU).l2(0.003).build()) .layer(1, new LSTM.Builder().nIn(100).nOut(100).activation(Activation.TANH).build()) .layer(2, new LSTM.Builder().nIn(100).nOut(100).activation(Activation.TANH).build()) diff --git a/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer2.java b/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer2.java index d6ac22e11..ac625f2b6 100644 --- a/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer2.java +++ b/brutex-extended-tests/src/test/java/net/brutex/spark/TestServer2.java @@ -131,7 +131,7 @@ public class TestServer2 { .list() //.layer(0, new ConvolutionLayer.Builder().nIn(1).kernelSize(1, 5).stride(1,1).padding(0,2).nOut(1).name("1st Filter").updater(new Adam.Builder().learningRate(0.2).build()).build()) //.layer(1, new 
ConvolutionLayer.Builder().nIn(1).kernelSize(1, 2).stride(1,2).padding(0,0).nOut(1).name("2nd Filter").updater(new Adam.Builder().learningRate(0.1).build()).build()) - // .layer(1, new DenseLayer.Builder().nIn(10).nOut(64).activation(Activation.RELU).build()) + // .layer(1, new DenseLayerConfiguration.Builder().nIn(10).nOut(64).activation(Activation.RELU).build()) .layer(0, new DenseLayer.Builder().nIn(10).nOut(100).activation(Activation.RELU).l2(0.003).build()) .layer(1, new LSTM.Builder().nIn(100).nOut(100).activation(Activation.TANH).build()) .layer(2, new LSTM.Builder().nIn(100).nOut(100).activation(Activation.TANH).build()) diff --git a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestBaselineGenerator.java b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestBaselineGenerator.java index 7c4bcc9ac..8111d2b7d 100644 --- a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestBaselineGenerator.java +++ b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestBaselineGenerator.java @@ -284,7 +284,7 @@ public class IntegrationTestBaselineGenerator { INDArray paramsPostTraining; if (modelType == ModelType.MLN) { int[] layersToTrain = tc.getUnsupervisedTrainLayersMLN(); - Preconditions.checkState(layersToTrain != null, "Layer indices must not be null"); + Preconditions.checkState(layersToTrain != null, "ILayer indices must not be null"); DataSetIterator dsi = new MultiDataSetWrapperIterator(iter); for (int i : layersToTrain) { @@ -293,7 +293,7 @@ public class IntegrationTestBaselineGenerator { paramsPostTraining = mln.params(); } else if (modelType == ModelType.CG) { String[] layersToTrain = tc.getUnsupervisedTrainLayersCG(); - Preconditions.checkState(layersToTrain != null, "Layer names must not be null"); + Preconditions.checkState(layersToTrain != null, "ILayer names must not be null"); for (String i : layersToTrain) { cg.pretrainLayer(i, iter); diff --git a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestRunner.java b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestRunner.java index fbc0d60a3..489c8021d 100644 --- a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestRunner.java +++ b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/IntegrationTestRunner.java @@ -200,7 +200,7 @@ public class IntegrationTestRunner { m = cg; ComputationGraph loaded = ComputationGraph.load(savedModel, true); - assertEquals(loaded.getConfiguration(), cg.getConfiguration(), "Configs not equal" ); + assertEquals(loaded.getComputationGraphConfiguration(), cg.getComputationGraphConfiguration(), "Configs not equal" ); assertEquals( loaded.params(), cg.params(), "Params not equal"); assertEquals(loaded.paramTable(), cg.paramTable(), "Param table not equal"); } else if(config instanceof SameDiff){ @@ -383,7 +383,7 @@ public class IntegrationTestRunner { org.deeplearning4j.nn.api.Layer[] layers; if(modelType == ModelType.MLN){ int[] layersToTrain = tc.getUnsupervisedTrainLayersMLN(); - Preconditions.checkState(layersToTrain != null, "Layer indices must not be null"); + Preconditions.checkState(layersToTrain != null, "ILayer indices must not be null"); DataSetIterator dsi = new MultiDataSetWrapperIterator(iter); for( int i : layersToTrain){ @@ -393,7 +393,7 @@ public class IntegrationTestRunner { layers = mln.getLayers(); } else if(modelType == ModelType.CG) { String[] layersToTrain = 
tc.getUnsupervisedTrainLayersCG(); - Preconditions.checkState(layersToTrain != null, "Layer names must not be null"); + Preconditions.checkState(layersToTrain != null, "ILayer names must not be null"); for( String i : layersToTrain){ cg.pretrainLayer(i, iter); @@ -429,8 +429,8 @@ public class IntegrationTestRunner { isTbptt = mln.getLayerWiseConfigurations().getBackpropType() == BackpropType.TruncatedBPTT; tbpttLength = mln.getLayerWiseConfigurations().getTbpttFwdLength(); } else if(modelType == ModelType.CG) { - isTbptt = cg.getConfiguration().getBackpropType() == BackpropType.TruncatedBPTT; - tbpttLength = cg.getConfiguration().getTbpttFwdLength(); + isTbptt = cg.getComputationGraphConfiguration().getBackpropType() == BackpropType.TruncatedBPTT; + tbpttLength = cg.getComputationGraphConfiguration().getTbpttFwdLength(); } else { isTbptt = false; tbpttLength = 0; @@ -458,11 +458,11 @@ public class IntegrationTestRunner { epochAfter = mln.getEpochCount(); layers = mln.getLayers(); } else if(modelType == ModelType.CG){ - iterBefore = cg.getConfiguration().getIterationCount(); - epochBefore = cg.getConfiguration().getEpochCount(); + iterBefore = cg.getComputationGraphConfiguration().getIterationCount(); + epochBefore = cg.getComputationGraphConfiguration().getEpochCount(); cg.fit(countingIter); - iterAfter = cg.getConfiguration().getIterationCount(); - epochAfter = cg.getConfiguration().getEpochCount(); + iterAfter = cg.getComputationGraphConfiguration().getIterationCount(); + epochAfter = cg.getComputationGraphConfiguration().getEpochCount(); layers = cg.getLayers(); } else { iterBefore = sd.getTrainingConfig().getIterationCount(); @@ -611,7 +611,7 @@ public class IntegrationTestRunner { } else if(modelType == ModelType.CG){ ModelSerializer.writeModel(m, f, true); ComputationGraph restored = ComputationGraph.load(f, true); - assertEquals(cg.getConfiguration(), restored.getConfiguration()); + assertEquals(cg.getComputationGraphConfiguration(), restored.getComputationGraphConfiguration()); assertEquals(cg.params(), restored.params()); } else { sd.save(f, true); @@ -745,7 +745,7 @@ public class IntegrationTestRunner { preProcessors = mln.getLayerWiseConfigurations().getInputPreProcessors().values(); } else { preProcessors = new ArrayList<>(); - for (org.deeplearning4j.nn.conf.graph.GraphVertex gv : cg.getConfiguration().getVertices().values()) { + for (org.deeplearning4j.nn.conf.graph.GraphVertex gv : cg.getComputationGraphConfiguration().getVertices().values()) { if (gv instanceof LayerVertex) { InputPreProcessor pp = ((LayerVertex) gv).getPreProcessor(); if (pp != null) { @@ -760,7 +760,7 @@ public class IntegrationTestRunner { //Collect vertex coverage information if (!isMLN) { - for (org.deeplearning4j.nn.conf.graph.GraphVertex gv : cg.getConfiguration().getVertices().values()) { + for (org.deeplearning4j.nn.conf.graph.GraphVertex gv : cg.getComputationGraphConfiguration().getVertices().values()) { vertexConfClassesSeen.put(gv.getClass(), vertexConfClassesSeen.getOrDefault(gv.getClass(), 0) + 1); } } @@ -872,14 +872,14 @@ public class IntegrationTestRunner { log.info("||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"); - log.info("Layer coverage - classes seen:"); + log.info("ILayer coverage - classes seen:"); for (Class c : layerClasses) { if (layerConfClassesSeen.containsKey(c)) { log.info("Class seen {} times in tests: {}", layerConfClassesSeen.get(c), c.getName()); } } - log.info("Layer classes NOT seen in any tests:"); + 
log.info("ILayer classes NOT seen in any tests:"); for (Class c : layerClasses) { if (!layerConfClassesSeen.containsKey(c)) { log.info("Class NOT seen in any tests: {}", c.getName()); diff --git a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/TestUtils.java b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/TestUtils.java index 5c16cc908..e03f2a523 100644 --- a/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/TestUtils.java +++ b/brutex-extended-tests/src/test/java/org/deeplearning4j/integration/TestUtils.java @@ -73,7 +73,7 @@ public class TestUtils { ByteArrayInputStream bais = new ByteArrayInputStream(bytes); restored = ModelSerializer.restoreComputationGraph(bais, true); - assertEquals(net.getConfiguration(), restored.getConfiguration()); + assertEquals(net.getComputationGraphConfiguration(), restored.getComputationGraphConfiguration()); assertEquals(net.params(), restored.params()); } catch (IOException e){ //Should never happen @@ -81,7 +81,7 @@ public class TestUtils { } //Also check the ComputationGraphConfiguration is serializable (required by Spark etc) - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); serializeDeserializeJava(conf); return restored; diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/TestUtils.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/TestUtils.java index f1e12d123..cecc969ac 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/TestUtils.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/TestUtils.java @@ -90,7 +90,7 @@ public class TestUtils { ByteArrayInputStream bais = new ByteArrayInputStream(bytes); restored = ModelSerializer.restoreComputationGraph(bais, true); - assertEquals(net.getConfiguration(), restored.getConfiguration()); + assertEquals(net.getComputationGraphConfiguration(), restored.getComputationGraphConfiguration()); assertEquals(net.params(), restored.params()); } catch (IOException e){ //Should never happen @@ -98,7 +98,7 @@ public class TestUtils { } //Also check the ComputationGraphConfiguration is serializable (required by Spark etc) - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); serializeDeserializeJava(conf); return restored; diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/eval/EvalTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/eval/EvalTest.java index 30cb1e5ca..7b44d26c9 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/eval/EvalTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/eval/EvalTest.java @@ -626,7 +626,7 @@ public class EvalTest extends BaseDL4JTest { net.evaluate(iter); net.evaluateROCMultiClass(iter, 0); - cg.getConfiguration().setValidateOutputLayerConfig(false); + cg.getComputationGraphConfiguration().setValidateOutputLayerConfig(false); cg.evaluate(iter); cg.evaluateROCMultiClass(iter, 0); } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/BNGradientCheckTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/BNGradientCheckTest.java index 65f8787d8..f45861f57 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/BNGradientCheckTest.java +++ 
b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/BNGradientCheckTest.java @@ -90,7 +90,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { mln.init(); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -135,7 +135,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { mln.init(); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -237,7 +237,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]); // for (int k = 0; k < mln.getnLayers(); k++) -// System.out.println("Layer " + k + " # params: " + mln.getLayer(k).numParams()); +// System.out.println("ILayer " + k + " # params: " + mln.getLayer(k).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -341,7 +341,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]); // for (int k = 0; k < mln.getnLayers(); k++) -// System.out.println("Layer " + k + " # params: " + mln.getLayer(k).numParams()); +// System.out.println("ILayer " + k + " # params: " + mln.getLayer(k).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -385,7 +385,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { mln.init(); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -430,7 +430,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { mln.init(); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean @@ -572,7 +572,7 @@ public class BNGradientCheckTest extends BaseDL4JTest { + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]); // for (int k = 0; k < net.getNumLayers(); 
k++) -// System.out.println("Layer " + k + " # params: " + net.getLayer(k).numParams()); +// System.out.println("ILayer " + k + " # params: " + net.getLayer(k).numParams()); //Mean and variance vars are not gradient checkable; mean/variance "gradient" is used to implement running mean/variance calc //i.e., runningMean = decay * runningMean + (1-decay) * batchMean diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN1DGradientCheckTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN1DGradientCheckTest.java index b61c1fe24..b9f461775 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN1DGradientCheckTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN1DGradientCheckTest.java @@ -118,7 +118,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -198,7 +198,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -282,7 +282,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -359,7 +359,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN3DGradientCheckTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN3DGradientCheckTest.java index 4d3de0bfb..1f4a1ceec 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN3DGradientCheckTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNN3DGradientCheckTest.java @@ -149,7 +149,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { log.info(msg); // for (int j = 0; j < net.getnLayers(); j++) { -// log.info("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// log.info("ILayer " + j + " # params: " + net.getLayer(j).numParams()); // } } @@ -252,7 +252,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { log.info(msg); // for (int j = 0; j < net.getnLayers(); j++) { -// log.info("Layer " + j + " # params: " + 
net.getLayer(j).numParams()); +// log.info("ILayer " + j + " # params: " + net.getLayer(j).numParams()); // } } @@ -431,7 +431,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { log.info(msg); // for (int j = 0; j < net.getnLayers(); j++) { -// log.info("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// log.info("ILayer " + j + " # params: " + net.getLayer(j).numParams()); // } } @@ -530,7 +530,7 @@ public class CNN3DGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { log.info(msg); // for (int j = 0; j < net.getnLayers(); j++) { -// log.info("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// log.info("ILayer " + j + " # params: " + net.getLayer(j).numParams()); // } } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNNGradientCheckTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNNGradientCheckTest.java index b9536ee41..b737fcf79 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNNGradientCheckTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/CNNGradientCheckTest.java @@ -137,7 +137,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -231,7 +231,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -293,7 +293,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); @@ -361,7 +361,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); @@ -427,7 +427,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + 
net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -500,7 +500,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -920,7 +920,7 @@ public class CNNGradientCheckTest extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GlobalPoolingGradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GlobalPoolingGradientCheckTests.java index 7cb10f83b..36574096d 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GlobalPoolingGradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GlobalPoolingGradientCheckTests.java @@ -95,7 +95,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest { System.out.println("testLSTMGlobalPoolingBasicMultiLayer() - " + pt + ", minibatch = " + miniBatchSize); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -156,7 +156,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testCnnGlobalPoolingBasicMultiLayer() - " + pt + ", minibatch = " + miniBatchSize + " - " + (nchw ? 
"NCHW" : "NHWC")); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -216,7 +216,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testLSTMGlobalPoolingBasicMultiLayer() - " + pt + ", minibatch = " + miniBatchSize); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.MLNConfig().net(mln).input(input) @@ -299,7 +299,7 @@ public class GlobalPoolingGradientCheckTests extends BaseDL4JTest { System.out.println("testCnnGlobalPoolingBasicMultiLayer() - " + pt + ", minibatch = " + miniBatchSize); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.MLNConfig().net(mln).input(input) diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTests.java index cab80a69a..553477bd5 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTests.java @@ -123,7 +123,7 @@ public class GradientCheckTests extends BaseDL4JTest { + lf + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -203,7 +203,7 @@ public class GradientCheckTests extends BaseDL4JTest { + lf + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -297,7 +297,7 @@ public class GradientCheckTests extends BaseDL4JTest { + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2 + ", l1=" + l1); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -342,7 +342,7 @@ public class GradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testEmbeddingLayerSimple"); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + 
mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -382,7 +382,7 @@ public class GradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testEmbeddingLayerSimple"); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -472,7 +472,7 @@ public class GradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -714,7 +714,7 @@ public class GradientCheckTests extends BaseDL4JTest { // (a) activation function // (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation') // (c) Loss function (with specified output activations) - // (d) Layer Normalization enabled / disabled + // (d) ILayer Normalization enabled / disabled Activation[] activFns = {Activation.SIGMOID, Activation.TANH}; boolean[] characteristic = {true, false}; //If true: run some backprop steps first @@ -776,7 +776,7 @@ public class GradientCheckTests extends BaseDL4JTest { + lf + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst + ", layerNorm=" + layerNorm); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTestsComputationGraph.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTestsComputationGraph.java index ec99f3852..7718078a6 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTestsComputationGraph.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/GradientCheckTestsComputationGraph.java @@ -106,7 +106,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testBasicIris()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -157,7 +157,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testBasicIrisWithMerging()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = 
GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -214,7 +214,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testBasicIrisWithElementWiseVertex(op=" + op + ")"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -274,7 +274,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testBasicIrisWithElementWiseVertex(op=" + op + ")"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -376,7 +376,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -439,7 +439,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -478,7 +478,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testLSTMWithSubset()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -515,7 +515,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testLSTMWithLastTimeStepVertex()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } //First: test with no input mask array @@ -579,7 +579,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testLSTMWithDuplicateToTimeSeries()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new 
GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input1, input2}) @@ -628,7 +628,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testLSTMWithReverseTimeSeriesVertex()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -683,7 +683,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(inputs) @@ -723,7 +723,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -769,7 +769,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(input) @@ -820,7 +820,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{input}) @@ -888,7 +888,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testBasicIrisTripletStackingL2Loss()"); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{pos, anc, neg}) @@ -949,7 +949,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{example}) 
@@ -1014,7 +1014,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < net.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + net.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + net.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -1063,7 +1063,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1, in2}) @@ -1121,7 +1121,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1, in2}) @@ -1179,7 +1179,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1, in2}) @@ -1242,7 +1242,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } graph.setLayerMaskArrays(new INDArray[] {inMask1, inMask2}, null); @@ -1301,7 +1301,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1, in2}) @@ -1347,7 +1347,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1}) @@ -1398,7 +1398,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < 
graph.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + graph.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + graph.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(graph).inputs(new INDArray[]{in1}) @@ -1436,7 +1436,7 @@ public class GradientCheckTestsComputationGraph extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println("testGraphEmbeddingLayerSimple"); // for (int j = 0; j < cg.getNumLayers(); j++) -// System.out.println("Layer " + j + " # params: " + cg.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + cg.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.GraphConfig().net(cg).inputs(new INDArray[]{input}) diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LRNGradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LRNGradientCheckTests.java index 9d982818a..87ea20cf5 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LRNGradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LRNGradientCheckTests.java @@ -84,7 +84,7 @@ public class LRNGradientCheckTests extends BaseDL4JTest { // if (PRINT_RESULTS) { // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); // } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LSTMGradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LSTMGradientCheckTests.java index c1e20d858..a2c7d7039 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LSTMGradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/LSTMGradientCheckTests.java @@ -126,7 +126,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -215,7 +215,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(new GradientCheckUtil.MLNConfig().net(mln).input(input) @@ -343,7 +343,7 @@ public class LSTMGradientCheckTests extends BaseDL4JTest { + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, diff --git 
a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/NoBiasGradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/NoBiasGradientCheckTests.java index 5cfec0631..477199be0 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/NoBiasGradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/NoBiasGradientCheckTests.java @@ -78,7 +78,7 @@ public class NoBiasGradientCheckTests extends BaseDL4JTest { .dist(new NormalDistribution(0, 1)) .activation(Activation.TANH) - .hasBias(true) //Layer 0: Always have a bias + .hasBias(true) //ILayer 0: Always have a bias .build()) .layer(1, new DenseLayer.Builder().nIn(layerSize).nOut(layerSize) diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/OutputLayerGradientChecks.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/OutputLayerGradientChecks.java index 1c1da4cee..0928b52de 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/OutputLayerGradientChecks.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/OutputLayerGradientChecks.java @@ -137,7 +137,7 @@ public class OutputLayerGradientChecks extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } System.out.println("Starting test: " + testName); @@ -244,7 +244,7 @@ public class OutputLayerGradientChecks extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } System.out.println("Starting test: " + testName); @@ -393,7 +393,7 @@ public class OutputLayerGradientChecks extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(testName); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } System.out.println("Starting test: " + testName); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/VaeGradientCheckTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/VaeGradientCheckTests.java index 92ddf8622..40041885e 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/VaeGradientCheckTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/gradientcheck/VaeGradientCheckTests.java @@ -124,7 +124,7 @@ public class VaeGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, @@ -195,7 +195,7 @@ public class VaeGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int l = 0; l < mln.getnLayers(); l++) -// System.out.println("Layer " + l + " # params: 
" + mln.getLayer(l).numParams()); +// System.out.println("ILayer " + l + " # params: " + mln.getLayer(l).numParams()); } boolean gradOK = GradientCheckUtil.checkGradientsPretrainLayer(layer, DEFAULT_EPS, @@ -283,7 +283,7 @@ public class VaeGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradientsPretrainLayer(layer, DEFAULT_EPS, @@ -325,7 +325,7 @@ public class VaeGradientCheckTests extends BaseDL4JTest { if (PRINT_RESULTS) { System.out.println(msg); // for (int j = 0; j < mln.getnLayers(); j++) -// System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); +// System.out.println("ILayer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradientsPretrainLayer(layer, DEFAULT_EPS, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/conf/layers/LayerConfigTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/conf/layers/LayerConfigTest.java index 60b549714..be25a0ccd 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/conf/layers/LayerConfigTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/conf/layers/LayerConfigTest.java @@ -133,8 +133,8 @@ public class LayerConfigTest extends BaseDL4JTest { //Learning rate without layerwise override: MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(0.3).list() - .layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); @@ -143,8 +143,8 @@ public class LayerConfigTest extends BaseDL4JTest { //With: conf = new NeuralNetConfiguration.Builder().learningRate(0.3).list() - .layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).learningRate(0.2).build()).build(); + .layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).learningRate(0.2).build()).build(); net = new MultiLayerNetwork(conf); net.init(); @@ -154,8 +154,8 @@ public class LayerConfigTest extends BaseDL4JTest { //L1 and L2 without layerwise override: conf = new NeuralNetConfiguration.Builder().l1(0.1).l2(0.2).list() - .layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); net = new MultiLayerNetwork(conf); net.init(); @@ -166,8 +166,8 @@ public class LayerConfigTest extends BaseDL4JTest { //L1 and L2 with layerwise override: conf = new NeuralNetConfiguration.Builder().l1(0.1).l2(0.2).list() - .layer(0, new DenseLayer.Builder().nIn(2).nOut(2).l1(0.9).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).l2(0.8).build()).build(); + .layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).l1(0.9).build()) + .layer(1, new 
DenseLayerConfiguration.Builder().nIn(2).nOut(2).l2(0.8).build()).build(); net = new MultiLayerNetwork(conf); net.init(); @@ -326,8 +326,8 @@ public class LayerConfigTest extends BaseDL4JTest { MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(lr) .updater(Updater.SGD) .learningRateDecayPolicy(LearningRatePolicy.Exponential).lrPolicyDecayRate(lrDecayRate).list() - .layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); @@ -345,8 +345,8 @@ public class LayerConfigTest extends BaseDL4JTest { int iterations = 1; MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().iterations(iterations).learningRate(lr) .learningRateDecayPolicy(LearningRatePolicy.Inverse).lrPolicyDecayRate(lrDecayRate) - .lrPolicyPower(power).list().layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .lrPolicyPower(power).list().layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); @@ -367,8 +367,8 @@ public class LayerConfigTest extends BaseDL4JTest { int iterations = 1; MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().iterations(iterations).learningRate(lr) .learningRateDecayPolicy(LearningRatePolicy.Step).lrPolicyDecayRate(lrDecayRate) - .lrPolicySteps(steps).list().layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .lrPolicySteps(steps).list().layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); @@ -388,8 +388,8 @@ public class LayerConfigTest extends BaseDL4JTest { int iterations = 1; MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().iterations(iterations).learningRate(lr) .learningRateDecayPolicy(LearningRatePolicy.Poly).lrPolicyDecayRate(lrDecayRate) - .lrPolicyPower(power).list().layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .lrPolicyPower(power).list().layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); @@ -409,8 +409,8 @@ public class LayerConfigTest extends BaseDL4JTest { int iterations = 1; MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().iterations(iterations).learningRate(lr) .learningRateDecayPolicy(LearningRatePolicy.Sigmoid).lrPolicyDecayRate(lrDecayRate) - .lrPolicySteps(steps).list().layer(0, new DenseLayer.Builder().nIn(2).nOut(2).build()) - .layer(1, new DenseLayer.Builder().nIn(2).nOut(2).build()).build(); + .lrPolicySteps(steps).list().layer(0, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()) + .layer(1, new DenseLayerConfiguration.Builder().nIn(2).nOut(2).build()).build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); diff --git 
a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java index b3e625849..edad9fb7d 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java @@ -229,7 +229,7 @@ public class DTypeTests extends BaseDL4JTest { if (seenLayers.size() < layerClasses.size()) { for (Class c : layerClasses) { if (!seenLayers.contains(c) && !ignoreClasses.contains(c)) { - log.warn("Layer class not tested for global vs. network datatypes: {}", c); + log.warn("ILayer class not tested for global vs. network datatypes: {}", c); fail = true; } } @@ -279,7 +279,7 @@ public class DTypeTests extends BaseDL4JTest { } public static void logUsedClasses(ComputationGraph net) { - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); for (GraphVertex gv : conf.getVertices().values()) { seenVertices.add(gv.getClass()); if (gv instanceof LayerVertex) { diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/ComputationGraphTestRNN.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/ComputationGraphTestRNN.java index eb8c1cbcc..2d2379fdb 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/ComputationGraphTestRNN.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/ComputationGraphTestRNN.java @@ -65,7 +65,7 @@ public class ComputationGraphTestRNN extends BaseDL4JTest { Nd4j.getRandom().setSeed(12345); int timeSeriesLength = 12; - //4 layer network: 2 GravesLSTM + DenseLayer + RnnOutputLayer. Hence also tests preprocessors. + //4 layer network: 2 GravesLSTM + DenseLayerConfiguration + RnnOutputLayer. Hence also tests preprocessors. ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345).graphBuilder() .addInputs("in") .addLayer("0", new org.deeplearning4j.nn.conf.layers.GravesLSTM.Builder().nIn(5).nOut(7) @@ -208,7 +208,7 @@ public class ComputationGraphTestRNN extends BaseDL4JTest { Nd4j.getRandom().setSeed(12345); int timeSeriesLength = 12; - //4 layer network: 2 GravesLSTM + DenseLayer + RnnOutputLayer. Hence also tests preprocessors. + //4 layer network: 2 GravesLSTM + DenseLayerConfiguration + RnnOutputLayer. Hence also tests preprocessors. 
//Network architecture: lstm0 -> Dense -> RnnOutputLayer0 // and lstm1 -> Dense -> RnnOutputLayer1 ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345).graphBuilder() @@ -391,9 +391,9 @@ public class ComputationGraphTestRNN extends BaseDL4JTest { graphTBPTT.init(); graphTBPTT.clearTbpttState = false; - assertEquals(BackpropType.TruncatedBPTT, graphTBPTT.getConfiguration().getBackpropType()); - assertEquals(timeSeriesLength, graphTBPTT.getConfiguration().getTbpttFwdLength()); - assertEquals(timeSeriesLength, graphTBPTT.getConfiguration().getTbpttBackLength()); + assertEquals(BackpropType.TruncatedBPTT, graphTBPTT.getComputationGraphConfiguration().getBackpropType()); + assertEquals(timeSeriesLength, graphTBPTT.getComputationGraphConfiguration().getTbpttFwdLength()); + assertEquals(timeSeriesLength, graphTBPTT.getComputationGraphConfiguration().getTbpttBackLength()); INDArray inputData = Nd4j.rand(miniBatchSize, nIn, timeSeriesLength); INDArray labels = Nd4j.rand(miniBatchSize, nOut, timeSeriesLength); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestCompGraphUnsupervised.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestCompGraphUnsupervised.java index a17979bf2..794538c36 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestCompGraphUnsupervised.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestCompGraphUnsupervised.java @@ -42,7 +42,6 @@ import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.indexing.conditions.Conditions; import org.nd4j.linalg.learning.config.Adam; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -168,8 +167,8 @@ public class TestCompGraphUnsupervised extends BaseDL4JTest { net.init(); ComputationGraph cg = net.toComputationGraph(); - cg.getConfiguration().setInferenceWorkspaceMode(wsm); - cg.getConfiguration().setTrainingWorkspaceMode(wsm); + cg.getComputationGraphConfiguration().setInferenceWorkspaceMode(wsm); + cg.getComputationGraphConfiguration().setTrainingWorkspaceMode(wsm); DataSetIterator ds = new EarlyTerminationDataSetIterator(new MnistDataSetIterator(1, true, 12345), 1); Nd4j.getRandom().setSeed(12345); net.pretrainLayer(0, ds); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java index 7a918a674..a6373c6a9 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java @@ -1033,15 +1033,15 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { DataSetIterator iter = new IrisDataSetIterator(50, 150); - assertEquals(0, network.getConfiguration().getIterationCount()); + assertEquals(0, network.getComputationGraphConfiguration().getIterationCount()); network.fit(iter); - assertEquals(3, network.getConfiguration().getIterationCount()); + assertEquals(3, network.getComputationGraphConfiguration().getIterationCount()); iter.reset(); network.fit(iter); - assertEquals(6, network.getConfiguration().getIterationCount()); + assertEquals(6, network.getComputationGraphConfiguration().getIterationCount()); iter.reset(); network.fit(iter.next()); - assertEquals(7, network.getConfiguration().getIterationCount()); + assertEquals(7, 
network.getComputationGraphConfiguration().getIterationCount()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ModelSerializer.writeModel(network, baos, true); @@ -1049,7 +1049,7 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { ByteArrayInputStream bais = new ByteArrayInputStream(asBytes); ComputationGraph net = ModelSerializer.restoreComputationGraph(bais, true); - assertEquals(7, net.getConfiguration().getIterationCount()); + assertEquals(7, net.getComputationGraphConfiguration().getIterationCount()); } @Test @@ -1272,18 +1272,18 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { ComputationGraph net = new ComputationGraph(conf); net.init(); - assertEquals(0, net.getConfiguration().getEpochCount()); + assertEquals(0, net.getComputationGraphConfiguration().getEpochCount()); DataSetIterator iter = new IrisDataSetIterator(150, 150); for( int i=0; i<4; i++ ){ - assertEquals(i, net.getConfiguration().getEpochCount()); + assertEquals(i, net.getComputationGraphConfiguration().getEpochCount()); net.fit(iter); - assertEquals(i+1, net.getConfiguration().getEpochCount()); + assertEquals(i+1, net.getComputationGraphConfiguration().getEpochCount()); } - assertEquals(4, net.getConfiguration().getEpochCount()); + assertEquals(4, net.getComputationGraphConfiguration().getEpochCount()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -1293,7 +1293,7 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { ByteArrayInputStream bais = new ByteArrayInputStream(bytes); ComputationGraph restored = ModelSerializer.restoreComputationGraph(bais, true); - assertEquals(4, restored.getConfiguration().getEpochCount()); + assertEquals(4, restored.getComputationGraphConfiguration().getEpochCount()); } @Test @@ -1619,13 +1619,13 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { GraphIndices indices = cg.calculateIndices(); int[] order = cg.topologicalSortOrder(); - List strOrder = cg.getConfiguration().getTopologicalOrderStr(); + List strOrder = cg.getComputationGraphConfiguration().getTopologicalOrderStr(); INDArray[] out1 = cg.output(in); //Check it's the same after loading: ComputationGraph cg2 = TestUtils.testModelSerialization(cg); int[] order2 = cg2.topologicalSortOrder(); - List strOrder2 = cg.getConfiguration().getTopologicalOrderStr(); + List strOrder2 = cg.getComputationGraphConfiguration().getTopologicalOrderStr(); assertArrayEquals(order, order2); assertEquals(strOrder, strOrder2); @@ -1633,7 +1633,7 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { assertArrayEquals(out1, out2); //Delete the topological order, ensure it gets recreated properly: - ComputationGraphConfiguration conf3 = cg2.getConfiguration().clone(); + ComputationGraphConfiguration conf3 = cg2.getComputationGraphConfiguration().clone(); conf3.setTopologicalOrder(null); conf3.setTopologicalOrderStr(null); ComputationGraph cg3 = new ComputationGraph(conf3); @@ -1641,7 +1641,7 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { cg3.setParams(cg2.params()); int[] order3 = cg3.topologicalSortOrder(); - List strOrder3 = cg.getConfiguration().getTopologicalOrderStr(); + List strOrder3 = cg.getComputationGraphConfiguration().getTopologicalOrderStr(); INDArray[] out3 = cg3.output(in); assertArrayEquals(order, order3); assertEquals(strOrder, strOrder3); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/FrozenLayerTest.java 
b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/FrozenLayerTest.java index c3543e167..0f506dbfe 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/FrozenLayerTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/FrozenLayerTest.java @@ -235,7 +235,7 @@ public class FrozenLayerTest extends BaseDL4JTest { ComputationGraph clonedModel = modelNow.clone(); //Check json - assertEquals(clonedModel.getConfiguration().toJson(), modelNow.getConfiguration().toJson()); + assertEquals(clonedModel.getComputationGraphConfiguration().toJson(), modelNow.getComputationGraphConfiguration().toJson()); //Check params assertEquals(modelNow.params(), clonedModel.params()); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/TestDropout.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/TestDropout.java index 67f66fb21..868f34ba7 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/TestDropout.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/TestDropout.java @@ -50,7 +50,7 @@ public class TestDropout extends BaseDL4JTest { @Test public void testDropoutSimple() throws Exception { //Testing dropout with a single layer - //Layer input: values should be set to either 0.0 or 2.0x original value + //ILayer input: values should be set to either 0.0 or 2.0x original value int nIn = 8; int nOut = 8; diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/feedforward/embedding/EmbeddingLayerTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/feedforward/embedding/EmbeddingLayerTest.java index 259a38382..55c26b12b 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/feedforward/embedding/EmbeddingLayerTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/feedforward/embedding/EmbeddingLayerTest.java @@ -200,7 +200,7 @@ public class EmbeddingLayerTest extends BaseDL4JTest { @Test public void testEmbeddingForwardPass() { //With the same parameters, embedding layer should have same activations as the equivalent one-hot representation - // input with a DenseLayer + // input with a DenseLayerConfiguration int nClassesIn = 10; @@ -243,7 +243,7 @@ public class EmbeddingLayerTest extends BaseDL4JTest { @Test public void testEmbeddingBackwardPass() { //With the same parameters, embedding layer should have same activations as the equivalent one-hot representation - // input with a DenseLayer + // input with a DenseLayerConfiguration int nClassesIn = 10; diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/ocnn/OCNNOutputLayerTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/ocnn/OCNNOutputLayerTest.java index e9f76dfc2..0eaa156f1 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/ocnn/OCNNOutputLayerTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/ocnn/OCNNOutputLayerTest.java @@ -104,7 +104,7 @@ public class OCNNOutputLayerTest extends BaseDL4JTest { + "ocnn" + "sigmoid" + ", doLearningFirst=" + doLearningFirst); for (int j = 0; j < network.getnLayers(); j++) - System.out.println("Layer " + j + " # params: " + network.getLayer(j).numParams()); + System.out.println("ILayer " + j + " # params: " + network.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(network, DEFAULT_EPS, 
DEFAULT_MAX_REL_ERROR, diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java index e84390916..3595282c0 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java @@ -98,7 +98,7 @@ public class SameDiffDense extends SameDiffLayer { if(DefaultParamInitializer.BIAS_KEY.equals(e.getKey())){ e.getValue().assign(0.0); } else { - //Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayer + //Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayerConfiguration WeightInitUtil.initWeights(nIn, nOut, new long[]{nIn, nOut}, weightInit, null, 'f', e.getValue()); } } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDenseVertex.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDenseVertex.java index da674ea7c..baa4cee7e 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDenseVertex.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDenseVertex.java @@ -72,14 +72,14 @@ public class SameDiffDenseVertex extends SameDiffVertex { @Override public void initializeParameters(Map params) { - //Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayer + //Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayerConfiguration WeightInitUtil.initWeights(nIn, nOut, new long[]{nIn, nOut}, weightInit, null, 'f', params.get("W")); params.get("b").assign(0.0); } @Override public char paramReshapeOrder(String paramName){ - return 'f'; //To match DL4J DenseLayer - for easy comparison + return 'f'; //To match DL4J DenseLayerConfiguration - for easy comparison } @Override diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/misc/WorkspaceTests.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/misc/WorkspaceTests.java index cf7d31bd5..5b00685af 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/misc/WorkspaceTests.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/misc/WorkspaceTests.java @@ -73,8 +73,8 @@ public class WorkspaceTests extends BaseDL4JTest { ComputationGraph c = createNet(); for (WorkspaceMode wm : new WorkspaceMode[]{WorkspaceMode.NONE, WorkspaceMode.ENABLED}) { log.info("Starting test: {}", wm); - c.getConfiguration().setTrainingWorkspaceMode(wm); - c.getConfiguration().setInferenceWorkspaceMode(wm); + c.getComputationGraphConfiguration().setTrainingWorkspaceMode(wm); + c.getComputationGraphConfiguration().setInferenceWorkspaceMode(wm); INDArray f = Nd4j.rand(8, 1, 28, 28); INDArray l = Nd4j.rand(8, 10); @@ -666,8 +666,8 @@ public class WorkspaceTests extends BaseDL4JTest { ComputationGraph c = createNet(); for (WorkspaceMode wm : new WorkspaceMode[]{WorkspaceMode.NONE, WorkspaceMode.ENABLED}) { log.info("Starting test: {}", wm); - c.getConfiguration().setTrainingWorkspaceMode(wm); - c.getConfiguration().setInferenceWorkspaceMode(wm); + c.getComputationGraphConfiguration().setTrainingWorkspaceMode(wm); + c.getComputationGraphConfiguration().setInferenceWorkspaceMode(wm); INDArray f = 
Nd4j.rand(8, 1, 28, 28); INDArray l = Nd4j.rand(8, 10); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTest.java index 056f4a43e..49d70647c 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTest.java @@ -995,7 +995,7 @@ public class MultiLayerTest extends BaseDL4JTest { @Test public void testCompareLayerMethods(){ - //Simple test: compare .layer(int, Layer) and .layer(Layer) are identical + //Simple test: compare .layer(int, ILayer) and .layer(ILayer) are identical MultiLayerConfiguration conf1 = new NeuralNetConfiguration.Builder().seed(123).list() .layer(0, new DenseLayer.Builder().nIn(4).nOut(3).weightInit(WeightInit.XAVIER) diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTestRNN.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTestRNN.java index 5064e44ab..a12bd88f9 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTestRNN.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/multilayer/MultiLayerTestRNN.java @@ -261,7 +261,7 @@ public class MultiLayerTestRNN extends BaseDL4JTest { Nd4j.getRandom().setSeed(12345); int timeSeriesLength = 12; - //4 layer network: 2 GravesLSTM + DenseLayer + RnnOutputLayer. Hence also tests preprocessors. + //4 layer network: 2 GravesLSTM + DenseLayerConfiguration + RnnOutputLayer. Hence also tests preprocessors. MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345).list() .layer(0, l0) .layer(1, l1) diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/rl/TestMultiModelGradientApplication.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/rl/TestMultiModelGradientApplication.java index 410abf970..92b8375dd 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/rl/TestMultiModelGradientApplication.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/rl/TestMultiModelGradientApplication.java @@ -216,8 +216,8 @@ public class TestMultiModelGradientApplication extends BaseDL4JTest { net2GradUpd.getUpdater().getStateViewArray()); //Remove the next 2 lines: fails - as net 1 is 1 iteration ahead - net1GradCalc.getConfiguration().setIterationCount(0); - net2GradUpd.getConfiguration().setIterationCount(0); + net1GradCalc.getComputationGraphConfiguration().setIterationCount(0); + net2GradUpd.getComputationGraphConfiguration().setIterationCount(0); for (int i = 0; i < 100; i++) { diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TestTransferLearningModelSerializer.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TestTransferLearningModelSerializer.java index ad92a7c47..44c3bcb07 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TestTransferLearningModelSerializer.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TestTransferLearningModelSerializer.java @@ -120,7 +120,7 @@ public class TestTransferLearningModelSerializer extends BaseDL4JTest { assertTrue(withFrozen.getLayer(0) instanceof FrozenLayer); assertTrue(withFrozen.getLayer(1) instanceof FrozenLayer); - Map m = 
withFrozen.getConfiguration().getVertices(); + Map m = withFrozen.getComputationGraphConfiguration().getVertices(); Layer l0 = ((LayerVertex) m.get("0")).getLayerConf().getLayer(); Layer l1 = ((LayerVertex) m.get("1")).getLayerConf().getLayer(); assertTrue(l0 instanceof org.deeplearning4j.nn.conf.layers.misc.FrozenLayer); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningCompGraphTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningCompGraphTest.java index a81d96838..efc821b6e 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningCompGraphTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningCompGraphTest.java @@ -102,7 +102,7 @@ public class TransferLearningCompGraphTest extends BaseDL4JTest { .build(); //Check json - assertEquals(expectedConf.toJson(), modelNow.getConfiguration().toJson()); + assertEquals(expectedConf.toJson(), modelNow.getComputationGraphConfiguration().toJson()); //Check params after fit modelNow.fit(randomData); @@ -382,7 +382,7 @@ public class TransferLearningCompGraphTest extends BaseDL4JTest { modelExpectedArch.getVertex("layer0").setLayerAsFrozen(); modelExpectedArch.getVertex("layer1").setLayerAsFrozen(); - assertEquals(modelExpectedArch.getConfiguration().toJson(), modelNow.getConfiguration().toJson()); + assertEquals(modelExpectedArch.getComputationGraphConfiguration().toJson(), modelNow.getComputationGraphConfiguration().toJson()); modelNow.setParams(modelExpectedArch.params()); int i = 0; @@ -445,7 +445,7 @@ public class TransferLearningCompGraphTest extends BaseDL4JTest { // assertEquals(confExpected, graph.getConfiguration()); - assertEquals(confExpected.toJson(), graph.getConfiguration().toJson()); + assertEquals(confExpected.toJson(), graph.getComputationGraphConfiguration().toJson()); } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelperTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelperTest.java index 0e78a3d6c..d7e58be43 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelperTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelperTest.java @@ -126,7 +126,7 @@ public class TransferLearningHelperTest extends BaseDL4JTest { .setOutputs("outLeft", "outCentre", "outRight").build(); ComputationGraph expectedModel = new ComputationGraph(expectedConf); expectedModel.init(); - assertEquals(expectedConf.toJson(), modelSubset.getConfiguration().toJson()); + assertEquals(expectedConf.toJson(), modelSubset.getComputationGraphConfiguration().toJson()); } @Test diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/optimize/solver/TestOptimizers.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/optimize/solver/TestOptimizers.java index 5b7bec134..73e1a7a56 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/optimize/solver/TestOptimizers.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/optimize/solver/TestOptimizers.java @@ -764,7 +764,7 @@ public class TestOptimizers extends BaseDL4JTest { } - /** Simple abstract class to deal with the fact that we don't care about the majority of the Model/Layer + /** Simple abstract class to deal with the fact 
that we don't care about the majority of the Model/ILayer * methods here. Classes extending this model for optimizer tests need only implement the score() and * gradient() methods. */ @@ -907,7 +907,7 @@ public class TestOptimizers extends BaseDL4JTest { @Override public INDArray input() { - //Work-around for BaseUpdater.postApply(): Uses Layer.input().size(0) + //Work-around for BaseUpdater.postApply(): Uses ILayer.input().size(0) //in order to get mini-batch size. i.e., divide by 1 here. return Nd4j.zeros(1); } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest060.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest060.java index 985f347d8..87a53e54a 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest060.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest060.java @@ -221,7 +221,7 @@ public class RegressionTest060 extends BaseDL4JTest { ComputationGraph net = ModelSerializer.restoreComputationGraph(f, true); - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); assertEquals(3, conf.getVertices().size()); GravesLSTM l0 = (GravesLSTM) ((LayerVertex) conf.getVertices().get("0")).getLayerConf().getLayer(); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest071.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest071.java index 2a75e7994..0dc3839bb 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest071.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest071.java @@ -221,7 +221,7 @@ public class RegressionTest071 extends BaseDL4JTest { ComputationGraph net = ModelSerializer.restoreComputationGraph(f, true); - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); assertEquals(3, conf.getVertices().size()); GravesLSTM l0 = (GravesLSTM) ((LayerVertex) conf.getVertices().get("0")).getLayerConf().getLayer(); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest080.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest080.java index 6566f03fe..6460582ba 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest080.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest080.java @@ -237,7 +237,7 @@ public class RegressionTest080 extends BaseDL4JTest { ComputationGraph net = ModelSerializer.restoreComputationGraph(f, true); - ComputationGraphConfiguration conf = net.getConfiguration(); + ComputationGraphConfiguration conf = net.getComputationGraphConfiguration(); assertEquals(3, conf.getVertices().size()); GravesLSTM l0 = (GravesLSTM) ((LayerVertex) conf.getVertices().get("0")).getLayerConf().getLayer(); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100a.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100a.java index acee54871..f294e16a7 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100a.java +++ 
b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100a.java @@ -171,7 +171,7 @@ public class RegressionTest100a extends BaseDL4JTest { int nBoxes = 5; int nClasses = 10; - ConvolutionLayer cl = (ConvolutionLayer)((LayerVertex)net.getConfiguration().getVertices().get("convolution2d_9")).getLayerConf().getLayer(); + ConvolutionLayer cl = (ConvolutionLayer)((LayerVertex)net.getComputationGraphConfiguration().getVertices().get("convolution2d_9")).getLayerConf().getLayer(); assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); assertEquals(new ActivationIdentity(), cl.getActivationFn()); assertEquals(ConvolutionMode.Same, cl.getConvolutionMode()); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b3.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b3.java index 8df2f258b..35fb7391b 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b3.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b3.java @@ -206,7 +206,7 @@ public class RegressionTest100b3 extends BaseDL4JTest { int nBoxes = 5; int nClasses = 10; - ConvolutionLayer cl = (ConvolutionLayer)((LayerVertex)net.getConfiguration().getVertices().get("convolution2d_9")).getLayerConf().getLayer(); + ConvolutionLayer cl = (ConvolutionLayer)((LayerVertex)net.getComputationGraphConfiguration().getVertices().get("convolution2d_9")).getLayerConf().getLayer(); assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); assertEquals(new ActivationIdentity(), cl.getActivationFn()); assertEquals(ConvolutionMode.Same, cl.getConvolutionMode()); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java index 5b4270a4e..00e46bf0c 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java @@ -224,7 +224,7 @@ public class RegressionTest100b4 extends BaseDL4JTest { int nBoxes = 5; int nClasses = 10; - ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getConfiguration().getVertices() + ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getComputationGraphConfiguration().getVertices() .get("convolution2d_9")).getLayerConf().getLayer(); assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); assertEquals(new ActivationIdentity(), cl.getActivationFn()); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java index 40df45924..15a9c2bc3 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b6.java @@ -205,7 +205,7 @@ public class RegressionTest100b6 extends BaseDL4JTest { int nBoxes = 5; int nClasses = 10; - ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getConfiguration().getVertices() + ConvolutionLayer cl = (ConvolutionLayer) ((LayerVertex) net.getComputationGraphConfiguration().getVertices() .get("convolution2d_9")).getLayerConf().getLayer(); assertEquals(nBoxes * (5 + nClasses), cl.getNOut()); 
assertEquals(new ActivationIdentity(), cl.getActivationFn()); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/customlayer100a/CustomLayer.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/customlayer100a/CustomLayer.java index 00a2b6242..acb3963b1 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/customlayer100a/CustomLayer.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/regressiontest/customlayer100a/CustomLayer.java @@ -94,7 +94,7 @@ public class CustomLayer extends FeedForwardLayer { @Override public ParamInitializer initializer() { //This method returns the parameter initializer for this type of layer - //In this case, we can use the DefaultParamInitializer, which is the same one used for DenseLayer + //In this case, we can use the DefaultParamInitializer, which is the same one used for DenseLayerConfiguration //For more complex layers, you may need to implement a custom parameter initializer //See the various parameter initializers here: //https://github.com/deeplearning4j/deeplearning4j/tree/master/deeplearning4j-core/src/main/java/org/deeplearning4j/nn/params @@ -108,7 +108,7 @@ public class CustomLayer extends FeedForwardLayer { //If you don't need this functionality for your custom layer, you can return a LayerMemoryReport // with all 0s, or - //This implementation: based on DenseLayer implementation + //This implementation: based on DenseLayerConfiguration implementation InputType outputType = getOutputType(-1, inputType); val numParams = initializer().numParams(this); @@ -131,7 +131,7 @@ public class CustomLayer extends FeedForwardLayer { .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, - MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer + MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayerConfiguration .build(); } diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/CrashReportingUtilTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/CrashReportingUtilTest.java index 4da9883b8..8bfaa9eb2 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/CrashReportingUtilTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/CrashReportingUtilTest.java @@ -117,7 +117,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { String str = FileUtils.readFileToString(list[0]); // System.out.println(str); assertTrue(str.contains("Network Information")); - assertTrue(str.contains("Layer Helpers")); + assertTrue(str.contains("ILayer Helpers")); assertTrue(str.contains("JavaCPP")); assertTrue(str.contains("ScoreIterationListener")); @@ -134,7 +134,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { assertEquals(1, list.length); str = FileUtils.readFileToString(list[0]); assertTrue(str.contains("Network Information")); - assertTrue(str.contains("Layer Helpers")); + assertTrue(str.contains("ILayer Helpers")); assertTrue(str.contains("JavaCPP")); assertTrue(str.contains("ScoreIterationListener(1)")); @@ -150,7 +150,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { // System.out.println("///////////////////////////////////////////////////////////"); assertTrue(mlnMemoryInfo.contains("Network Information")); - assertTrue(mlnMemoryInfo.contains("Layer Helpers")); + assertTrue(mlnMemoryInfo.contains("ILayer Helpers")); 
assertTrue(mlnMemoryInfo.contains("JavaCPP")); assertTrue(mlnMemoryInfo.contains("ScoreIterationListener(1)")); @@ -172,7 +172,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { assertEquals(1, list.length); str = FileUtils.readFileToString(list[0]); assertTrue(str.contains("Network Information")); - assertTrue(str.contains("Layer Helpers")); + assertTrue(str.contains("ILayer Helpers")); assertTrue(str.contains("JavaCPP")); assertTrue(str.contains("ScoreIterationListener(1)")); @@ -187,7 +187,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { assertEquals(1, list.length); str = FileUtils.readFileToString(list[0]); assertTrue(str.contains("Network Information")); - assertTrue(str.contains("Layer Helpers")); + assertTrue(str.contains("ILayer Helpers")); assertTrue(str.contains("JavaCPP")); assertTrue(str.contains("ScoreIterationListener(1)")); @@ -203,7 +203,7 @@ public class CrashReportingUtilTest extends BaseDL4JTest { // System.out.println("///////////////////////////////////////////////////////////"); assertTrue(cgMemoryInfo.contains("Network Information")); - assertTrue(cgMemoryInfo.contains("Layer Helpers")); + assertTrue(cgMemoryInfo.contains("ILayer Helpers")); assertTrue(cgMemoryInfo.contains("JavaCPP")); assertTrue(cgMemoryInfo.contains("ScoreIterationListener(1)")); diff --git a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/ModelSerializerTest.java b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/ModelSerializerTest.java index 610cb0961..e01d42f01 100644 --- a/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/ModelSerializerTest.java +++ b/cavis-dnn/cavis-dnn-core/src/test/java/org/deeplearning4j/util/ModelSerializerTest.java @@ -151,7 +151,7 @@ public class ModelSerializerTest extends BaseDL4JTest { ComputationGraph network = ModelSerializer.restoreComputationGraph(tempFile); - assertEquals(network.getConfiguration().toJson(), cg.getConfiguration().toJson()); + assertEquals(network.getComputationGraphConfiguration().toJson(), cg.getComputationGraphConfiguration().toJson()); assertEquals(cg.params(), network.params()); assertEquals(cg.getUpdater().getStateViewArray(), network.getUpdater().getStateViewArray()); } @@ -177,7 +177,7 @@ public class ModelSerializerTest extends BaseDL4JTest { ComputationGraph network = ModelSerializer.restoreComputationGraph(fis); - assertEquals(network.getConfiguration().toJson(), cg.getConfiguration().toJson()); + assertEquals(network.getComputationGraphConfiguration().toJson(), cg.getComputationGraphConfiguration().toJson()); assertEquals(cg.params(), network.params()); assertEquals(cg.getUpdater().getStateViewArray(), network.getUpdater().getStateViewArray()); } diff --git a/cavis-dnn/cavis-dnn-cudnn/src/main/java/org/deeplearning4j/cuda/recurrent/CudnnLSTMHelper.java b/cavis-dnn/cavis-dnn-cudnn/src/main/java/org/deeplearning4j/cuda/recurrent/CudnnLSTMHelper.java index 120078d07..2b71d920a 100644 --- a/cavis-dnn/cavis-dnn-cudnn/src/main/java/org/deeplearning4j/cuda/recurrent/CudnnLSTMHelper.java +++ b/cavis-dnn/cavis-dnn-cudnn/src/main/java/org/deeplearning4j/cuda/recurrent/CudnnLSTMHelper.java @@ -198,7 +198,7 @@ public class CudnnLSTMHelper extends BaseCudnnHelper implements LSTMHelper { } if (!(activationFn instanceof ActivationTanH)) { supported = false; - log.warn("Not supported: Layer activation functions != ActivationTanH"); + log.warn("Not supported: ILayer activation functions != ActivationTanH"); } if (hasPeepholeConnections) { supported = false; diff --git 
a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasLayer.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasLayer.java index 5c8c829c4..601237b53 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasLayer.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasLayer.java @@ -295,7 +295,7 @@ public class KerasLayer { } /** - * Copy Keras layer weights to DL4J Layer. + * Copy Keras layer weights to DL4J ILayer. * * @param layer DL4J layer * @throws InvalidKerasConfigurationException Invalid Keras configuration @@ -358,7 +358,7 @@ public class KerasLayer { } /** - * Whether this Keras layer maps to a DL4J Layer. + * Whether this Keras layer maps to a DL4J ILayer. * * @return true or false */ @@ -367,9 +367,9 @@ public class KerasLayer { } /** - * Gets corresponding DL4J Layer, if any. + * Gets corresponding DL4J ILayer, if any. * - * @return DL4J Layer + * @return DL4J ILayer * @see org.deeplearning4j.nn.api.Layer */ public Layer getLayer() { diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasModel.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasModel.java index d4bf6ba92..ea0b99f0c 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasModel.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasModel.java @@ -583,8 +583,8 @@ public class KerasModel { graphBuilder.addVertex(layer.getLayerName(), layer.getVertex(), inboundLayerNamesArray); } else if (layer.isInputPreProcessor()) { if (preprocessor == null) - throw new UnsupportedKerasConfigurationException("Layer " + layer.getLayerName() - + " could not be mapped to Layer, Vertex, or InputPreProcessor"); + throw new UnsupportedKerasConfigurationException("ILayer " + layer.getLayerName() + + " could not be mapped to ILayer, Vertex, or InputPreProcessor"); graphBuilder.addVertex(layer.getLayerName(), new PreprocessorVertex(preprocessor), inboundLayerNamesArray); } diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/config/KerasLayerConfiguration.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/config/KerasLayerConfiguration.java index a0082f4f1..d454d1e97 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/config/KerasLayerConfiguration.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/config/KerasLayerConfiguration.java @@ -246,7 +246,7 @@ public class KerasLayerConfiguration { private final String LAYER_FIELD_RATE = "rate"; private final String LAYER_FIELD_GAUSSIAN_VARIANCE = ""; // 1: sigma, 2: stddev - /* Layer wrappers */ + /* ILayer wrappers */ // Missing: TimeDistributed diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java index 9eae1f08e..f49599ccf 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java 
@@ -115,9 +115,9 @@ public class KerasDense extends KerasLayer { } /** - * Get DL4J DenseLayer. + * Get DL4J DenseLayerConfiguration. * - * @return DenseLayer + * @return DenseLayerConfiguration */ public DenseLayer getDenseLayer() { return (DenseLayer) this.layer; diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java index 4e35a6867..e1c6be765 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java @@ -211,10 +211,10 @@ public class KerasLSTM extends KerasLayer { } /** - * Get DL4J Layer. If returnSequences is true, this can be casted to an "LSTM" layer, otherwise it can be casted + * Get DL4J ILayer. If returnSequences is true, this can be casted to an "LSTM" layer, otherwise it can be casted * to a "LastTimeStep" layer. * - * @return LSTM Layer + * @return LSTM ILayer */ public Layer getLSTMLayer() { return layer; diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java index ac2d4c234..ea71fc8d7 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java @@ -184,7 +184,7 @@ public class KerasSimpleRnn extends KerasLayer { /** * Get DL4J SimpleRnn layer. * - * @return SimpleRnn Layer + * @return SimpleRnn ILayer */ public Layer getSimpleRnnLayer() { return this.layer; diff --git a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java index fa5f5b508..ccbbbd9d6 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java @@ -160,7 +160,7 @@ public class KerasBidirectional extends KerasLayer { /** * Return the underlying recurrent layer of this bidirectional layer * - * @return Layer, recurrent layer + * @return ILayer, recurrent layer */ public Layer getUnderlyingRecurrentLayer() { return kerasRnnlayer.getLayer(); @@ -169,7 +169,7 @@ public class KerasBidirectional extends KerasLayer { /** * Get DL4J Bidirectional layer. 
* - * @return Bidirectional Layer + * @return Bidirectional ILayer */ public Bidirectional getBidirectionalLayer() { return (Bidirectional) this.layer; diff --git a/cavis-dnn/cavis-dnn-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java b/cavis-dnn/cavis-dnn-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java index 1120dfbb8..f50df5084 100644 --- a/cavis-dnn/cavis-dnn-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java +++ b/cavis-dnn/cavis-dnn-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java @@ -85,7 +85,7 @@ public class FullModelComparisons extends BaseDL4JTest { System.out.println(model.summary()); - // 1. Layer + // 1. ILayer LSTM firstLstm = (LSTM) model.getLayer(0); org.deeplearning4j.nn.conf.layers.LSTM firstConf = (org.deeplearning4j.nn.conf.layers.LSTM) firstLstm.conf().getLayer(); @@ -123,7 +123,7 @@ public class FullModelComparisons extends BaseDL4JTest { Assertions.assertEquals(b.getDouble(0, 192), -0.13569744, 1e-7); // Keras O Assertions.assertEquals(b.getDouble(0, 0), -0.2587392, 1e-7); // Keras C - // 2. Layer + // 2. ILayer LSTM secondLstm = (LSTM) ((LastTimeStepLayer) model.getLayer(1)).getUnderlying(); org.deeplearning4j.nn.conf.layers.LSTM secondConf = (org.deeplearning4j.nn.conf.layers.LSTM) secondLstm.conf().getLayer(); diff --git a/cavis-dnn/cavis-dnn-nn-api/src/main/java/net/brutex/ai/dnn/api/LayerConfiguration.java b/cavis-dnn/cavis-dnn-nn-api/src/main/java/net/brutex/ai/dnn/api/LayerConfiguration.java index 0b274cb8c..6b395a5b2 100644 --- a/cavis-dnn/cavis-dnn-nn-api/src/main/java/net/brutex/ai/dnn/api/LayerConfiguration.java +++ b/cavis-dnn/cavis-dnn-nn-api/src/main/java/net/brutex/ai/dnn/api/LayerConfiguration.java @@ -39,4 +39,13 @@ public interface LayerConfiguration { */ org.deeplearning4j.nn.conf.inputs.InputType.Type getInputType(); + + /** + * Defines the valid input type for this Layer + * + * @return InputType + */ + org.deeplearning4j.nn.conf.inputs.InputType.Type getOutputType(); + + } diff --git a/cavis-dnn/cavis-dnn-nn/build.gradle b/cavis-dnn/cavis-dnn-nn/build.gradle index e0f85570d..0e097093d 100644 --- a/cavis-dnn/cavis-dnn-nn/build.gradle +++ b/cavis-dnn/cavis-dnn-nn/build.gradle @@ -22,7 +22,7 @@ apply from: "${project.rootProject.projectDir}/createTestBackends.gradle" dependencies { implementation platform(projects.cavisCommonPlatform) - implementation projects.cavisDnn.cavisDnnNnApi +// implementation projects.cavisDnn.cavisDnnNnApi implementation projects.cavisDnn.cavisDnnData.cavisDnnDataUtilityIterators implementation 'org.lucee:oswego-concurrent:1.3.4' implementation projects.cavisDnn.cavisDnnCommon @@ -57,4 +57,6 @@ dependencies { // define any required OkHttp artifacts without version implementation "com.squareup.okhttp3:okhttp" implementation "com.squareup.okhttp3:logging-interceptor" -} \ No newline at end of file +} +sourceCompatibility = JavaVersion.VERSION_11 +targetCompatibility = JavaVersion.VERSION_11 diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/LayerConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayer.java similarity index 60% rename from cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/LayerConfiguration.java rename to cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayer.java index 16c67b491..a43b94265 100644 
--- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/LayerConfiguration.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayer.java @@ -19,10 +19,28 @@ * */ -package net.brutex.ai.dnn.conf.layer; +package net.brutex.ai.dnn.api; -public abstract class LayerConfiguration { +/** + * This is an "executable" ILayer, that is based on a {@link ILayerConfiguration} + */ +public interface ILayer { + /** + * Get the underlying configuration for this ILayer + * @return configuration + */ + ILayerConfiguration getLayerConfiguration(); + /** + * Set the underlying layer configuration + * @param conf The new configuration + */ + void setLayerConfiguration(ILayerConfiguration conf); + /** + * An implementation should provide a method to validate the network + * @return true if no errors found; false otherwise + */ + boolean isValid(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FFLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayerConfiguration.java similarity index 56% rename from cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FFLayer.java rename to cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayerConfiguration.java index d903e9002..e0f5d856b 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FFLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/ILayerConfiguration.java @@ -19,34 +19,45 @@ * */ -package net.brutex.ai.dnn.conf.layer; - -import lombok.extern.slf4j.Slf4j; -import net.brutex.ai.dnn.api.Layer; -import net.brutex.ai.dnn.api.NeuralNetwork; -import net.brutex.ai.dnn.conf.layer.AbstractLayerConfiguration; -import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.inputs.InputType.Type; - -@Slf4j -public class FFLayer extends AbstractLayerConfiguration { +package net.brutex.ai.dnn.api; +public interface ILayerConfiguration { /** - * Create and return an instance of a LayerConfiguration. + * Create and return an instance of a ILayerConfiguration. 
* * @param network the "holding" network for the instance * @return the new layer instance */ - @Override - public Layer instantiate(NeuralNetwork network) { - //Let's do some verifications first - if(getInputType() != Type.FF) { - log.error("The {} layer configuration must use an InputType of {}, but found {}", - this.getClass().getSimpleName(), - Type.FF.name(), - getInputType().name()); - } - return null; - } + ILayer instantiate(IModel network); + + + /** + * Defines the valid input type for this ILayer + * + * @return InputType + */ + org.deeplearning4j.nn.conf.inputs.InputType.Type getInputType(); + + + /** + * Defines the valid input type for this ILayer + * + * @return InputType + */ + org.deeplearning4j.nn.conf.inputs.InputType.Type getOutputType(); + + + /** + * Number of trainable parameter in this layer + * @return number of parameter + */ + long numParameters(); + + /** + * An implementation should provide a method to validate the network + * @return true if no errors found; false otherwise + */ + boolean isValid(); + } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/IModel.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/IModel.java new file mode 100644 index 000000000..f0c6a722a --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/IModel.java @@ -0,0 +1,86 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.api; + +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.dataset.api.DataSet; +import org.nd4j.linalg.dataset.api.MultiDataSet; + +/** + * A Neural Network is an instance of a {@link INeuralNetworkConfiguration}, that can be trained, + * evaluated, saved, exported, etc. Its configuration state is defined with the + * {@link #setConfiguration(INeuralNetworkConfiguration)} and {@link #getConfiguration()} methods. + * + */ +public interface IModel { + + /** + * The configuration that defines this Neural Network + * + * @param conf the configuration to use for this network + */ + void setConfiguration(INeuralNetworkConfiguration conf); + INeuralNetworkConfiguration getConfiguration(); + + /** + * Fit the model for one iteration on the provided data + * + * @param features the examples to classify (one example in each row) + * @param labels the example labels(a binary outcome matrix) + * @param featuresMask The mask array for the features (used for variable length time series, etc). May be null. + * @param labelsMask The mask array for the labels (used for variable length time series, etc). May be null. 
+ */ + void fit(INDArray features, INDArray labels, INDArray featuresMask, INDArray labelsMask); + + /** + * This method fits model with a given DataSet + * + * @param dataSet the dataset to use for training + */ + void fit(DataSet dataSet); + + /** + * This method fits model with a given MultiDataSet + * + * @param dataSet the multi dataset to use for training + */ + void fit(MultiDataSet dataSet); + + /** + * The name of the Neural Network + * @return the name + */ + String getName(); + + /** + * Set the name for this Neural Network + * @param name the name + */ + void setName(String name); + + /** + * An implementation should provide a method to validate the network + * @return true if no errors found; false otherwise + */ + boolean isValid(); + +} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/NeuralNetwork.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetwork.java similarity index 58% rename from cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/NeuralNetwork.java rename to cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetwork.java index c9437b838..48d6c561b 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/NeuralNetwork.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetwork.java @@ -1,25 +1,27 @@ /* - * ****************************************************************************** - * * - * * - * * This program and the accompanying materials are made available under the - * * terms of the Apache License, Version 2.0 which is available at - * * https://www.apache.org/licenses/LICENSE-2.0. - * * - * * See the NOTICE file distributed with this work for additional - * * information regarding copyright ownership. - * * Unless required by applicable law or agreed to in writing, software - * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * * License for the specific language governing permissions and limitations - * * under the License. - * * - * * SPDX-License-Identifier: Apache-2.0 - * ***************************************************************************** + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. 
+ * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * */ -package org.deeplearning4j.nn.api; +package net.brutex.ai.dnn.api; +import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; import org.deeplearning4j.optimize.api.ConvexOptimizer; import org.nd4j.evaluation.IEvaluation; import org.nd4j.linalg.api.ndarray.INDArray; @@ -31,7 +33,7 @@ import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator; /** * @author raver119 */ -public interface NeuralNetwork { +public interface INeuralNetwork { /** * This method does initialization of model @@ -104,4 +106,17 @@ public interface NeuralNetwork { * @param iterator */ T[] doEvaluation(MultiDataSetIterator iterator, T... evaluations); + + /** + * A neural network is created from a configuration. + * @param conf the configuration to create the network from + */ + void setConfiguration(NeuralNetworkConfiguration conf); + + /** + * Return the configuration for this configuration + * @return + */ + NeuralNetworkConfiguration getConfiguration(); + } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetworkConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetworkConfiguration.java new file mode 100644 index 000000000..81d447fa3 --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/api/INeuralNetworkConfiguration.java @@ -0,0 +1,52 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.api; + +import java.util.List; + +public interface INeuralNetworkConfiguration { + +} +/** + /** + * Provides a flat list of all embedded layer configurations, this + * can only be called after the layer is initialized or {@link #getLayerConfigurations()} is + * called. 
+ * + * @return unstacked layer configurations + + List getLayerConfigurations(); + + + /** + * This uncollables any stacked layer configurations within building blocks like + * @link BuildingBlockLayer} + + void calculateInnerLayerConfigurations(); + + /** + * An implementation should provide a method to validate the network + * @return true if no errors found; false otherwise + + boolean isValid(); +} +**/ \ No newline at end of file diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/NeuralNetworkConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/NeuralNetworkConfiguration.java index e383ea9c7..51de9f873 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/NeuralNetworkConfiguration.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/NeuralNetworkConfiguration.java @@ -22,32 +22,61 @@ package net.brutex.ai.dnn.conf; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.exc.InvalidTypeIdException; +import com.fasterxml.jackson.databind.node.ArrayNode; +import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Random; import lombok.Getter; import lombok.NonNull; import lombok.Setter; import lombok.Singular; import lombok.extern.jackson.Jacksonized; import lombok.extern.slf4j.Slf4j; -import net.brutex.ai.dnn.api.LayerConfiguration; +import net.brutex.ai.dnn.api.ILayerConfiguration; +import net.brutex.ai.dnn.api.INeuralNetworkConfiguration; import org.deeplearning4j.nn.conf.BackpropType; import org.deeplearning4j.nn.conf.CacheMode; +import org.deeplearning4j.nn.conf.GradientNormalization; import org.deeplearning4j.nn.conf.InputPreProcessor; +import org.deeplearning4j.nn.conf.MultiLayerConfiguration; +import org.deeplearning4j.nn.conf.NeuralNetConfiguration; import org.deeplearning4j.nn.conf.WorkspaceMode; +import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.layers.wrapper.BuildingBlockLayer; +import org.deeplearning4j.nn.conf.layers.BaseLayer; +import org.deeplearning4j.nn.conf.layers.BaseOutputLayer; +import org.deeplearning4j.nn.conf.layers.Layer; +import org.deeplearning4j.nn.conf.memory.LayerMemoryReport; +import org.deeplearning4j.nn.conf.memory.MemoryReport; +import org.deeplearning4j.nn.conf.memory.NetworkMemoryReport; +import org.deeplearning4j.nn.conf.serde.JsonMappers; +import org.deeplearning4j.nn.weights.IWeightInit; +import org.deeplearning4j.nn.weights.WeightInit; +import org.nd4j.linalg.activations.Activation; +import org.nd4j.linalg.activations.IActivation; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.learning.config.IUpdater; +import org.nd4j.linalg.lossfunctions.LossFunctions; +import org.nd4j.linalg.lossfunctions.impl.LossBinaryXENT; +import org.nd4j.linalg.lossfunctions.impl.LossMCXENT; +import org.nd4j.linalg.lossfunctions.impl.LossMSE; +import org.nd4j.linalg.lossfunctions.impl.LossNegativeLogLikelihood; /** - * The NeuralNetworkConfiguration is a sequential container for the different layers in your + * The INeuralNetworkConfiguration is a sequential container for the different layers in your * network (or other NeuralNetworkConfigurations). 
That said, NeuralNetworkConfigurations can be * stacked.

- * It then “chains” outputs to inputs sequentially for each NeuralNetworkConfiguration, + * It then “chains” outputs to inputs sequentially for each INeuralNetworkConfiguration, * finally returning the output of the "top" configuration. Any settings made, are inherited and can - * be overridden on a "deeper" level. For this use case, you need to wrap the NeuralNetworkConfiguration + * be overridden on a "deeper" level. For this use case, you need to wrap the INeuralNetworkConfiguration * into a BuildingBlockLayer * */ @@ -55,77 +84,54 @@ import org.deeplearning4j.nn.conf.layers.wrapper.BuildingBlockLayer; @JsonIgnoreProperties(ignoreUnknown = true) @lombok.Builder @Slf4j -public class NeuralNetworkConfiguration implements net.brutex.ai.dnn.api.NeuralNetworkConfiguration, Serializable, Cloneable { - - /** - * The default {@link CacheMode} for this configuration. Will be set to "NONE" if not specified otherwise. - * Valid values are
- * CacheMode.NONE,
- * CacheMode.HOST or
- * CacheMode.DEVICE
- */ - @NonNull - @lombok.Builder.Default private CacheMode cacheMode = CacheMode.NONE; - - @Getter @Setter @NonNull - protected WorkspaceMode trainingWorkspaceMode = WorkspaceMode.ENABLED; - - @Getter @Setter @NonNull - protected WorkspaceMode inferenceWorkspaceMode = WorkspaceMode.ENABLED; - - @Getter @Setter @NonNull - protected BackpropType backpropType = BackpropType.Standard; - - @Getter - protected Map inputPreProcessors = new HashMap<>(); - - - @Getter @Setter protected int tbpttFwdLength = 20; - @Getter @Setter protected int tbpttBackLength = 20; - - - /** - * The list of layer configurations in this configuration. They will be indexed automatically - * as the layers get added starting with index 0. - */ - @Singular @Getter - private List layerConfigurations; - - /** - * The name for this configuration. Defaults to "Anonymous NeuralNetworkConfiguration" if - * it is not specified. - */ - @lombok.Builder.Default @Getter - private String name = "Anonymous NeuralNetworkConfiguration"; - - - /** - * The {@link InputType} of the data for this network configuration - */ - private InputType inputType; +public class NeuralNetworkConfiguration extends NeuralNetConfiguration implements + INeuralNetworkConfiguration, Serializable, Cloneable { + private static final int DEFAULT_TBPTT_LENGTH = 20; + @Getter protected final List confs = new ArrayList<>(); /** * hidden list of layers, that "flattens" all the layers of this network and applies * inheritance. */ @lombok.Builder.ObtainVia(method = "calculateInnerLayers") - private final List innerLayerConfigurations; - - @Override - public void calculateInnerLayerConfigurations() { - List list = new ArrayList<>(); - for( LayerConfiguration layer : this.layerConfigurations) { - if(layer instanceof BuildingBlockLayer) { - BuildingBlockLayer blayer = (BuildingBlockLayer) layer; - blayer.getConf().calculateInnerLayerConfigurations(); - list.addAll(blayer.getConf().getLayerConfigurations()); - } else { - list.add(layer); - } - } - this.layerConfigurations = list; - } - + private final List innerLayerConfigurations; + @Getter @Setter @NonNull @Singular + protected List layers = new ArrayList<>(); + @Getter @Setter @NonNull @lombok.Builder.Default @Deprecated + protected WorkspaceMode trainingWorkspaceMode = WorkspaceMode.ENABLED; + @Getter @Setter @NonNull @lombok.Builder.Default @Deprecated + protected WorkspaceMode inferenceWorkspaceMode = WorkspaceMode.ENABLED; + /** + * The type of backprop. Default setting is used for most networks (MLP, CNN etc), but + * optionally truncated BPTT can be used for training recurrent neural networks. If using + * TruncatedBPTT make sure you set both tBPTTForwardLength() and tBPTTBackwardLength() + */ + @Getter @Setter @NonNull @lombok.Builder.Default + protected BackpropType backpropType = BackpropType.Standard; + @Getter + protected Map inputPreProcessors = new HashMap<>(); + /** + * When doing truncated BPTT: how many steps of forward pass should we do before doing + * (truncated) backprop?
Only applicable when doing + * backpropType(BackpropType.TruncatedBPTT)
Typically the tBPTTForwardLength parameter is the same + * as the tBPTTBackwardLength parameter, but may be larger than it in some circumstances (but + * never smaller)
Ideally your training data time series length should be divisible by this. + * This is the k1 parameter on pg23 of + * http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf + * + * @param forwardLength Forward length > 0, >= backwardLength + */ + @Getter @Setter protected int tbpttFwdLength = 20; + /** + * When doing truncated BPTT: how many steps of backward should we do?
Only applicable when + * doing backpropType(BackpropType.TruncatedBPTT)
This is the k2 parameter on pg23 of + * http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf + * + * @param backwardLength <= forwardLength + */ + @Getter @Setter protected int tbpttBackLength = 20; /** * Creates and returns a copy of this object. * @@ -136,8 +142,564 @@ public class NeuralNetworkConfiguration implements net.brutex.ai.dnn.api.NeuralN * cannot be cloned. * @see Cloneable */ - @Override - protected Object clone() throws CloneNotSupportedException { - return super.clone(); + + //Nd4j.getRandom().setSeed(getConf(0).getSeed()); //TODO + //Counter for the number of parameter updates so far + // This is important for learning rate schedules, for example, and is stored here to ensure it is persisted + // for Spark and model serialization + @Getter @Setter + protected int iterationCount = 0; + //Counter for the number of epochs completed so far. Used for per-epoch schedules + @Getter @Setter + protected int epochCount = 0; + protected double dampingFactor = 100; + @Getter @Setter //todo why? + private Layer layer; + /** + * A seed for this network, will be random if not specified. + */ + @Getter @Setter @NonNull @lombok.Builder.Default + private long seed = new Random().nextLong(); + /** + * The default {@link CacheMode} for this configuration. Will be set to "NONE" if not specified otherwise. + * This method defines how/if preOutput cache is handled: NONE: cache disabled (default value) + * HOST: Host memory will be used DEVICE: GPU memory will be used (on CPU backends effect will + * be the same as for HOST) + * + * Valid values are
+ * CacheMode.NONE,
+ * CacheMode.HOST or
+ * CacheMode.DEVICE
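For orientation, a minimal usage sketch of the Lombok-generated builder over the fields declared above; the builder method names are assumed to mirror the field names (cacheMode, backpropType, tbpttFwdLength, ...) and the values are illustrative only:

    NeuralNetworkConfiguration conf = NeuralNetworkConfiguration.builder()
            .name("example-net")
            .seed(42L)
            .cacheMode(CacheMode.HOST)                  // keep the preOutput cache in host memory
            .backpropType(BackpropType.TruncatedBPTT)
            .tbpttFwdLength(20)                         // k1: forward steps per truncated-BPTT update
            .tbpttBackLength(20)                        // k2: backward steps, never larger than k1
            .build();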
+ * @param cacheMode + */ + @NonNull @Getter @Setter + @lombok.Builder.Default private CacheMode cacheMode = CacheMode.NONE; + /** + * The list of layer configurations in this configuration. They will be indexed automatically + * as the layers get added starting with index 0. + */ + @Singular @Getter + private List layerConfigurations; + /** + * The name for this configuration. Defaults to "Anonymous INeuralNetworkConfiguration" if + * it is not specified. + */ + @lombok.Builder.Default @Getter + private String name = "Anonymous INeuralNetworkConfiguration"; + /** + * The {@link InputType} of the data for this network configuration + */ + private InputType inputType; + /** + * Set the DataType for the network parameters and activations for all layers in the network. + * Default: Float + * + * @param dataType Datatype to use for parameters and activations + */ + @Getter @Setter @lombok.Builder.Default @NonNull + private DataType dataType = DataType.FLOAT; + /** + * Whether to override the nIn configuration forcibly upon construction. Default value is true. + * @return builder pattern + */ + @Getter @Setter + @lombok.Builder.Default + private boolean overrideNinUponBuild = true; + /** + * Enabled by default. If enabled, the output layer configuration will be validated, to throw an + * exception on likely invalid outputs - such as softmax + nOut=1, or LossMCXENT + Tanh.
If + * disabled (false) no output layer validation will be performed.
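As a concrete illustration of the kind of configuration this check is meant to catch (a hypothetical, standalone snippet using the standard DL4J layer builder; OutputLayer, LossFunctions and Activation are assumed to be imported):

    // Softmax over a single output unit always produces 1.0, so the layer cannot learn anything;
    // output-layer validation rejects this unless validateOutputLayerConfig is set to false.
    OutputLayer suspicious = new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
            .nIn(10)
            .nOut(1)
            .activation(Activation.SOFTMAX)
            .build();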
Disabling this validation + * is not recommended, as the configurations that fail validation usually will not be able to + * learn correctly. However, the option to disable this validation is provided for advanced + * users when creating non-standard architectures. + * + * @param validate If true: validate output layer configuration. False: don't validate + */ + @Getter @Setter @lombok.Builder.Default + private boolean validateOutputLayerConfig=true; + /** + * Enabled by default. If enabled, an exception will be thrown when using the (invalid) + * combination of truncated backpropagation through time (TBPTT) with either a + * GlobalPoolingLayer or LastTimeStepLayer.
It is possible to disable this validation to + * allow what is almost certainly an invalid configuration to be used, however this is not + * recommended. + * + * @param validate Whether TBPTT validation should be performed + */ + @Getter @Setter @lombok.Builder.Default + private boolean validateTbpttConfig=true; + + + + /** + * Gradient updater configuration. For example, {@link org.nd4j.linalg.learning.config.Adam} + * or {@link org.nd4j.linalg.learning.config.Nesterovs}
+ * Note: values set by this method will be applied to all applicable layers in the network, unless a different + * value is explicitly set on a given layer. In other words: values set via this method are used as the default + * value, and can be overridden on a per-layer basis. + * + * @param updater Updater to use + */ + @Getter @Setter @NonNull + private IUpdater updater; + + /** + * Gradient normalization strategy. Used to specify gradient renormalization, gradient clipping etc. + * See {@link GradientNormalization} for details
+ * Note: values set by this method will be applied to all applicable layers in the network, unless a different + * value is explicitly set on a given layer. In other words: values set via this method are used as the default + * value, and can be overridden on a per-layer basis. + * + * @param gradientNormalization Type of normalization to use. Defaults to None. + * @see GradientNormalization + */ + @Getter @Setter @NonNull @lombok.Builder.Default + private GradientNormalization gradientNormalization = GradientNormalization.None; + + /** + * Threshold for gradient normalization, only used for GradientNormalization.ClipL2PerLayer, + * GradientNormalization.ClipL2PerParamType, and GradientNormalization.ClipElementWiseAbsoluteValue
+ * Not used otherwise.
+ * L2 threshold for first two types of clipping, or absolute value threshold for last type of clipping.
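For example, element-wise gradient clipping at an absolute value of 1.0 would look roughly like this (builder method names again assumed to mirror the field names; the threshold value is illustrative):

    NeuralNetworkConfiguration clipped = NeuralNetworkConfiguration.builder()
            .gradientNormalization(GradientNormalization.ClipElementWiseAbsoluteValue)
            .gradientNormalizationThreshold(1.0)
            .build();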
+ * Note: values set by this method will be applied to all applicable layers in the network, unless a different + * value is explicitly set on a given layer. In other words: values set via this method are used as the default + * value, and can be overridden on a per-layer basis. + */ + @Getter @Setter + private double gradientNormalizationThreshold; + + + /** + * Weight initialization scheme to use, for initial weight values + * Note: values set by this method will be applied to all applicable layers in the network, unless a different + * value is explicitly set on a given layer. In other words: values set via this method are used as the default + * value, and can be overridden on a per-layer basis. + */ + @Getter @Setter + private IWeightInit weightInit; + + /** + * Activation function / neuron non-linearity
+ * Note: values set by this method will be applied to all applicable layers in the network, unless a different + * value is explicitly set on a given layer. In other words: values set via this method are used as the default + * value, and can be overridden on a per-layer basis. + */ + @Getter @Setter + private IActivation activation; + + + + /** + * Create a neural net configuration from json + * + * @param json the neural net configuration from json + * @return {@link NeuralNetworkConfiguration} + */ + public static NeuralNetworkConfiguration fromJson(String json) { + NeuralNetworkConfiguration conf; + ObjectMapper mapper = NeuralNetworkConfiguration.mapper(); + try { + conf = mapper.readValue(json, NeuralNetworkConfiguration.class); + } catch (InvalidTypeIdException e) { + if (e.getMessage().contains("@class")) { + try { + //JSON may be legacy (1.0.0-alpha or earlier), attempt to load it using old format + return JsonMappers.getLegacyMapper().readValue(json, NeuralNetworkConfiguration.class); + } catch (InvalidTypeIdException e2) { + //Check for legacy custom layers: "Could not resolve type id 'CustomLayer' as a subtype of [simple type, class org.deeplearning4j.nn.conf.layers.ILayer]: known type ids = [Bidirectional, CenterLossOutputLayer, CnnLossLayer, ..." + //1.0.0-beta5: dropping support for custom layers defined in pre-1.0.0-beta format. Built-in layers from these formats still work + String msg = e2.getMessage(); + if (msg != null && msg.contains("Could not resolve type id")) { + throw new RuntimeException( + "Error deserializing MultiLayerConfiguration - configuration may have a custom " + + "layer, vertex or preprocessor, in pre version 1.0.0-beta JSON format.\nModels in legacy format with custom" + + + " layers should be loaded in 1.0.0-beta to 1.0.0-beta4 and saved again, before loading in the current version of DL4J", + e); + } + throw new RuntimeException(e2); + } catch (IOException e2) { + throw new RuntimeException(e2); + } + } + throw new RuntimeException(e); + } catch (IOException e) { + //Check if this exception came from legacy deserializer... + String msg = e.getMessage(); + if (msg != null && msg.contains("legacy")) { + throw new RuntimeException( + "Error deserializing MultiLayerConfiguration - configuration may have a custom " + + "layer, vertex or preprocessor, in pre version 1.0.0-alpha JSON format. These layers can be " + + + "deserialized by first registering them with NeuralNetConfiguration.registerLegacyCustomClassesForJSON(Class...)", + e); + } + throw new RuntimeException(e); + } + + //To maintain backward compatibility after loss function refactoring (configs generated with v0.5.0 or earlier) + // Previously: enumeration used for loss functions. 
Now: use classes + // IN the past, could have only been an OutputLayer or RnnOutputLayer using these enums + int layerCount = 0; + JsonNode confs = null; + for (NeuralNetworkConfiguration nnc : conf.getConfs()) { + Layer l = nnc.getLayer(); + if (l instanceof BaseOutputLayer && ((BaseOutputLayer) l).getLossFn() == null) { + //lossFn field null -> may be an old config format, with lossFunction field being for the enum + //if so, try walking the JSON graph to extract out the appropriate enum value + + BaseOutputLayer ol = (BaseOutputLayer) l; + try { + JsonNode jsonNode = mapper.readTree(json); + if (confs == null) { + confs = jsonNode.get("confs"); + } + if (confs instanceof ArrayNode) { + ArrayNode layerConfs = (ArrayNode) confs; + JsonNode outputLayerNNCNode = layerConfs.get(layerCount); + if (outputLayerNNCNode == null) { + throw new RuntimeException("should never happen"); //return conf; //Should never happen... + } + JsonNode outputLayerNode = outputLayerNNCNode.get("layer"); + + JsonNode lossFunctionNode = null; + if (outputLayerNode.has("output")) { + lossFunctionNode = outputLayerNode.get("output").get("lossFunction"); + } else if (outputLayerNode.has("rnnoutput")) { + lossFunctionNode = outputLayerNode.get("rnnoutput").get("lossFunction"); + } + + if (lossFunctionNode != null) { + String lossFunctionEnumStr = lossFunctionNode.asText(); + LossFunctions.LossFunction lossFunction = null; + try { + lossFunction = LossFunctions.LossFunction.valueOf(lossFunctionEnumStr); + } catch (Exception e) { + log.warn( + "OutputLayer with null LossFunction or pre-0.6.0 loss function configuration detected: could not parse JSON", + e); + } + + if (lossFunction != null) { + switch (lossFunction) { + case MSE: + ol.setLossFn(new LossMSE()); + break; + case XENT: + ol.setLossFn(new LossBinaryXENT()); + break; + case NEGATIVELOGLIKELIHOOD: + ol.setLossFn(new LossNegativeLogLikelihood()); + break; + case MCXENT: + ol.setLossFn(new LossMCXENT()); + break; + + //Remaining: TODO + case SQUARED_LOSS: + case RECONSTRUCTION_CROSSENTROPY: + default: + log.warn( + "OutputLayer with null LossFunction or pre-0.6.0 loss function configuration detected: could not set loss function for {}", + lossFunction); + break; + } + } + } + + } else { + log.warn( + "OutputLayer with null LossFunction or pre-0.6.0 loss function configuration detected: could not parse JSON: layer 'confs' field is not an ArrayNode (is: {})", + (confs != null ? confs.getClass() : null)); + } + } catch (IOException e) { + log.warn( + "OutputLayer with null LossFunction or pre-0.6.0 loss function configuration detected: could not parse JSON", + e); + break; + } + } + + //Also, pre 0.7.2: activation functions were Strings ("activationFunction" field), not classes ("activationFn") + //Try to load the old format if necessary, and create the appropriate IActivation instance + if ((l instanceof BaseLayer) && ((BaseLayer) l).getActivationFn() == null) { + try { + JsonNode jsonNode = mapper.readTree(json); + if (confs == null) { + confs = jsonNode.get("confs"); + } + if (confs instanceof ArrayNode) { + ArrayNode layerConfs = (ArrayNode) confs; + JsonNode outputLayerNNCNode = layerConfs.get(layerCount); + if (outputLayerNNCNode == null) { + throw new RuntimeException("Should never happen"); //return conf; //Should never happen... 
+ } + JsonNode layerWrapperNode = outputLayerNNCNode.get("layer"); + + if (layerWrapperNode == null || layerWrapperNode.size() != 1) { + continue; + } + + JsonNode layerNode = layerWrapperNode.elements().next(); + JsonNode activationFunction = layerNode.get( + "activationFunction"); //Should only have 1 element: "dense", "output", etc + + if (activationFunction != null) { + IActivation ia = Activation.fromString(activationFunction.asText()) + .getActivationFunction(); + ((BaseLayer) l).setActivationFn(ia); + } + } + + } catch (IOException e) { + log.warn( + "ILayer with null ActivationFn field or pre-0.7.2 activation function detected: could not parse JSON", + e); + } + } + + if (!handleLegacyWeightInitFromJson(json, l, mapper, confs, layerCount)) { + return conf; + } + + layerCount++; + } + return conf; } + + /** + * Handle {@link WeightInit} and {@link Distribution} from legacy configs in Json format. Copied + * from handling of {@link Activation} above. + * + * @return True if all is well and layer iteration shall continue. False else-wise. + */ + private static boolean handleLegacyWeightInitFromJson(String json, Layer l, ObjectMapper mapper, + JsonNode confs, int layerCount) { + if ((l instanceof BaseLayer) && ((BaseLayer) l).getWeightInitFn() == null) { + try { + JsonNode jsonNode = mapper.readTree(json); + if (confs == null) { + confs = jsonNode.get("confs"); + } + if (confs instanceof ArrayNode) { + ArrayNode layerConfs = (ArrayNode) confs; + JsonNode outputLayerNNCNode = layerConfs.get(layerCount); + if (outputLayerNNCNode == null) { + return false; //Should never happen... + } + JsonNode layerWrapperNode = outputLayerNNCNode.get("layer"); + + if (layerWrapperNode == null || layerWrapperNode.size() != 1) { + return true; + } + + JsonNode layerNode = layerWrapperNode.elements().next(); + JsonNode weightInit = layerNode.get( + "weightInit"); //Should only have 1 element: "dense", "output", etc + JsonNode distribution = layerNode.get("dist"); + + Distribution dist = null; + if (distribution != null) { + dist = mapper.treeToValue(distribution, Distribution.class); + } + + if (weightInit != null) { + final IWeightInit wi = WeightInit.valueOf(weightInit.asText()) + .getWeightInitFunction(dist); + ((BaseLayer) l).setWeightInitFn(wi); + } + } + + } catch (IOException e) { + log.warn( + "ILayer with null WeightInit detected: " + l.getLayerName() + ", could not parse JSON", + e); + } + } + return true; + + } + + /** + * Object mapper for serialization of configurations + * + * @return + */ + public static ObjectMapper mapperYaml() { + return JsonMappers.getMapperYaml(); + } + + /** + * Object mapper for serialization of configurations + * + * @return + */ + public static ObjectMapper mapper() { + return JsonMappers.getMapper(); + } + + + + /** + * @return JSON representation of NN configuration + */ + public String toYaml() { + ObjectMapper mapper = NeuralNetConfiguration.mapperYaml(); + synchronized (mapper) { + try { + return mapper.writeValueAsString(this); + } catch (com.fasterxml.jackson.core.JsonProcessingException e) { + throw new RuntimeException(e); + } + } + } + + /** + * @return JSON representation of NN configuration + */ + public String toJson() { + ObjectMapper mapper = NeuralNetConfiguration.mapper(); + synchronized (mapper) { + //JSON mappers are supposed to be thread safe: however, in practice they seem to miss fields occasionally + //when writeValueAsString is used by multiple threads. This results in invalid JSON. 
See issue #3243 + try { + return mapper.writeValueAsString(this); + } catch (com.fasterxml.jackson.core.JsonProcessingException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public String toString() { + return toJson(); + } + + public NeuralNetworkConfiguration getConf(int i) { + return confs.get(i); + } + + @Override + public NeuralNetworkConfiguration clone() { + + NeuralNetworkConfiguration clone = (NeuralNetworkConfiguration) super.clone(); + List confList = clone.getConfs(); + if (confList != null) { + List list = new ArrayList<>(); + for (NeuralNetworkConfiguration conf : confList) { + list.add(conf.clone()); + } + } + + if (clone.getInputPreProcessors() != null) { + Map map = new HashMap<>(); + for (Map.Entry entry : clone.getInputPreProcessors().entrySet()) { + map.put(entry.getKey(), entry.getValue().clone()); + } + clone.getInputPreProcessors().clear(); + clone.getInputPreProcessors().putAll(map); + } + + clone.setInferenceWorkspaceMode(this.inferenceWorkspaceMode); + clone.setTrainingWorkspaceMode(this.trainingWorkspaceMode); + clone.setCacheMode(this.cacheMode); + clone.setValidateOutputLayerConfig(this.validateOutputLayerConfig); + clone.setDataType(this.dataType); + + return clone; + + } + + public InputPreProcessor getInputPreProcess(int curr) { + return inputPreProcessors.get(curr); + } + + /** + * Get a {@link MemoryReport} for the given MultiLayerConfiguration. This is used to estimate the + * memory requirements for the given network configuration and input + * + * @param inputType Input types for the network + * @return Memory report for the network + */ + public NetworkMemoryReport getMemoryReport(InputType inputType) { + + Map memoryReportMap = new LinkedHashMap<>(); + int nLayers = confs.size(); + for (int i = 0; i < nLayers; i++) { + String layerName = confs.get(i).getLayer().getLayerName(); + if (layerName == null) { + layerName = String.valueOf(i); + } + + //Pass input type through preprocessor, if necessary + InputPreProcessor preproc = getInputPreProcess(i); + //TODO memory requirements for preprocessor + if (preproc != null) { + inputType = preproc.getOutputType(inputType); + } + + LayerMemoryReport report = confs.get(i).getLayer().getMemoryReport(inputType); + memoryReportMap.put(layerName, report); + + inputType = confs.get(i).getLayer().getOutputType(i, inputType); + } + + return new NetworkMemoryReport(memoryReportMap, MultiLayerConfiguration.class, + "MultiLayerNetwork", inputType); + } + + /** + * For the given input shape/type for the network, return a list of activation sizes for each + * layer in the network.
i.e., list.get(i) is the output activation sizes for layer i + * + * @param inputType Input type for the network + * @return A lits of activation types for the network, indexed by layer number + */ + public List getLayerActivationTypes(@NonNull InputType inputType) { + List out = new ArrayList<>(); + int nLayers = confs.size(); + for (int i = 0; i < nLayers; i++) { + InputPreProcessor preproc = getInputPreProcess(i); + if (preproc != null) { + inputType = preproc.getOutputType(inputType); + } + + inputType = confs.get(i).getLayer().getOutputType(i, inputType); + out.add(inputType); + } + return out; + } + + /** + * Defines some additional handy methods. Other than that, + * the builder is generated by lombok. + */ + public static class NeuralNetworkConfigurationBuilder { + + /** + * Specify the processors. These are used at each layer for doing things like normalization and + * shaping of input. + * + * @param processor what to use to preProcess the data. + * @return builder pattern + */ + public NeuralNetworkConfigurationBuilder inputPreProcessor(Integer layer, + InputPreProcessor processor) { + inputPreProcessors.put(layer, processor); + return this; + } + + /** + * Specify additional layer configurations + */ + @Deprecated + public NeuralNetworkConfigurationBuilder layersFromArray(Layer[] arrLayers) { + for(Layer l : arrLayers) { + layers.add( l ); + } + return this; + } + } + + } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/AbstractLayerConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/AbstractLayerConfiguration.java index 951688e51..1ed923bda 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/AbstractLayerConfiguration.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/AbstractLayerConfiguration.java @@ -24,12 +24,12 @@ package net.brutex.ai.dnn.conf.layer; import lombok.Getter; import lombok.NonNull; import lombok.Setter; -import net.brutex.ai.dnn.api.LayerConfiguration; -import org.deeplearning4j.nn.conf.inputs.InputType; +import lombok.experimental.SuperBuilder; +import net.brutex.ai.dnn.api.ILayerConfiguration; -public abstract class AbstractLayerConfiguration implements LayerConfiguration { +@SuperBuilder +public abstract class AbstractLayerConfiguration implements ILayerConfiguration { @Getter @Setter @NonNull - private InputType.Type inputType; - + private String name; } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/DenseLayerConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/DenseLayerConfiguration.java new file mode 100644 index 000000000..d472d99b2 --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/DenseLayerConfiguration.java @@ -0,0 +1,62 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.conf.layer; + +import lombok.Builder; +import lombok.experimental.SuperBuilder; +import org.deeplearning4j.nn.conf.layers.LayerValidation; + +/** + * The dense layer is a neural network layer that is connected deeply, which means each neuron in + * the dense layer receives input from all neurons of its previous layer. The dense layer is found + * to be the most commonly used layer in the models. + *

+ * In the background, the dense layer performs a matrix-vector multiplication. The values used in + * the matrix are actually parameters that can be trained and updated with the help of + * backpropagation. + *
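Concretely, for in inputs and out units the layer computes y = f(W · x + b), where W is an out-by-in weight matrix, b a bias vector of length out, and f the activation function; that gives in * out + out trainable parameters (for example, in = 784 and out = 100 yields 784 * 100 + 100 = 78,500), matching what numParameters() in FeedForwardLayerConfiguration returns further below.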

+ * The output generated by the dense layer is an ‘m’ dimensional vector. Thus, dense layer is + * basically used for changing the dimensions of the vector. Dense layers also applies operations + * like rotation, scaling, translation on the vector. + */ +@SuperBuilder +public class DenseLayerConfiguration extends FeedForwardLayerConfiguration { + + /** + * Decides whether we should include a bias vector for calculation purposes or not. + */ + @Builder.Default + boolean bias = true; + + + + /** + * An implementation to validate the network + * + * @return true if no errors found; false otherwise + */ + @Override + public boolean isValid() { + LayerValidation.assertNInNOutSet("DenseLayerConfiguration", getName(), -99, getIn(), getOut()); + return super.isValid(); + } +} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FeedForwardLayerConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FeedForwardLayerConfiguration.java new file mode 100644 index 000000000..c86869d54 --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/conf/layer/FeedForwardLayerConfiguration.java @@ -0,0 +1,99 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.conf.layer; + +import lombok.Getter; +import lombok.experimental.SuperBuilder; +import lombok.extern.slf4j.Slf4j; +import net.brutex.ai.dnn.api.ILayer; +import net.brutex.ai.dnn.api.ILayerConfiguration; +import net.brutex.ai.dnn.api.IModel; +import org.deeplearning4j.nn.conf.inputs.InputType; +import org.deeplearning4j.nn.conf.inputs.InputType.Type; + +/** + * A Feed Forward Layer Configuration + */ +@Slf4j +@SuperBuilder +public class FeedForwardLayerConfiguration extends AbstractLayerConfiguration implements ILayerConfiguration { + + @Getter private int in; + @Getter private int out; + + /** + * This Fast Forward ILayer will always output data as + * FF type. + * @return InputType for FF + **/ + @Getter + final InputType.Type outputType = InputType.Type.FF; + + @Getter + final InputType.Type inputType = InputType.Type.FF; + + /** + * Create and return an instance of a ILayerConfiguration. 
+ * + * @param network the "holding" network for the instance + * @return the new layer instance + */ + //@Override + public ILayer instantiate(IModel network) { + //Let's do some verifications first + if(getInputType() != Type.FF) { + log.error("The {} layer configuration must use an InputType of {}, but found {}", + this.getClass().getSimpleName(), + Type.FF.name(), + getInputType().name()); + } + return null; + } + + /** + * Number of trainable parameter in this layer + * + * @return number of parameter + */ + @Override + public long numParameters() { + return in * out + out; //add one extra out for the bias + } + + /** + * An implementation should provide a method to validate the network + * + * @return true if no errors found; false otherwise + */ + @Override + public boolean isValid() { + boolean result = true; + if(getInputType() != Type.FF) { + log.error("The {} layer configuration must use an InputType of {}, but found {}", + this.getClass().getSimpleName(), + Type.FF.name(), + getInputType().name()); + result = false; + } + return result; + } +} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/AbstractNeuralNetwork.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/AbstractNeuralNetwork.java deleted file mode 100644 index a1c36e988..000000000 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/AbstractNeuralNetwork.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * - * ****************************************************************************** - * * - * * This program and the accompanying materials are made available under the - * * terms of the Apache License, Version 2.0 which is available at - * * https://www.apache.org/licenses/LICENSE-2.0. - * * - * * See the NOTICE file distributed with this work for additional - * * information regarding copyright ownership. - * * Unless required by applicable law or agreed to in writing, software - * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * * License for the specific language governing permissions and limitations - * * under the License. - * * - * * SPDX-License-Identifier: Apache-2.0 - * ***************************************************************************** - * - */ - -package net.brutex.ai.dnn.impl.network; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import lombok.Getter; -import lombok.NonNull; -import lombok.Setter; -import net.brutex.ai.dnn.api.Layer; -import net.brutex.ai.dnn.api.NeuralNetwork; -import net.brutex.ai.dnn.api.LayerConfiguration; -import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; -import org.deeplearning4j.optimize.api.TrainingListener; -import org.nd4j.linalg.dataset.api.MultiDataSet; - -public abstract class AbstractNeuralNetwork implements NeuralNetwork { - - @Getter @Setter @NonNull - private String name; - - @Getter @NonNull - private NeuralNetworkConfiguration configuration; - - @Getter - private final Collection trainingListeners = new HashSet<>(); - - /** - * The neural network holds an instantiation of its configured - * layers. - * @return the actual runtime layers - */ - @Getter - private final List runtimeLayers = new ArrayList<>(); - - /** - * Sets the configuration to be used. Each time a configuration is set, the runtime layers - * of this NeuralNetwork are updated from the configuration. 
- * - * @param conf the configuration to use for this network - */ - public void setConfiguration(net.brutex.ai.dnn.api.NeuralNetworkConfiguration conf) { - List layers = conf.getLayerConfigurations(); - for(LayerConfiguration layer : layers) { - Layer initializedLayer = layer.instantiate(this); - this.getRuntimeLayers().add(initializedLayer); - } - this.configuration = configuration; - } - -} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/NeuralNetwork.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/NeuralNetwork.java deleted file mode 100644 index 198007baf..000000000 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/impl/network/NeuralNetwork.java +++ /dev/null @@ -1,692 +0,0 @@ -/* - * - * ****************************************************************************** - * * - * * This program and the accompanying materials are made available under the - * * terms of the Apache License, Version 2.0 which is available at - * * https://www.apache.org/licenses/LICENSE-2.0. - * * - * * See the NOTICE file distributed with this work for additional - * * information regarding copyright ownership. - * * Unless required by applicable law or agreed to in writing, software - * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * * License for the specific language governing permissions and limitations - * * under the License. - * * - * * SPDX-License-Identifier: Apache-2.0 - * ***************************************************************************** - * - */ - -package net.brutex.ai.dnn.impl.network; - -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import lombok.Getter; -import lombok.NonNull; -import lombok.Setter; -import lombok.extern.slf4j.Slf4j; -import lombok.val; -import org.bytedeco.javacpp.Pointer; -import org.deeplearning4j.datasets.iterator.MultiDataSetWrapperIterator; -import org.deeplearning4j.exception.DL4JInvalidInputException; -import org.deeplearning4j.nn.api.Classifier; -import org.deeplearning4j.nn.api.Layer; -import org.deeplearning4j.nn.api.MaskState; -import org.deeplearning4j.nn.api.Updater; -import org.deeplearning4j.nn.api.layers.IOutputLayer; -import org.deeplearning4j.nn.api.layers.RecurrentLayer; -import org.deeplearning4j.nn.conf.BackpropType; -import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.MultiLayerConfiguration; -import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; -import org.deeplearning4j.nn.conf.WorkspaceMode; -import org.deeplearning4j.nn.layers.FrozenLayerWithBackprop; -import org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer; -import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -import org.deeplearning4j.nn.updater.UpdaterCreator; -import org.deeplearning4j.nn.workspace.ArrayType; -import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; -import org.deeplearning4j.optimize.Solver; -import org.deeplearning4j.optimize.api.ConvexOptimizer; -import org.deeplearning4j.optimize.api.TrainingListener; -import org.deeplearning4j.util.CrashReportingUtil; -import org.deeplearning4j.util.ModelSerializer; -import org.nd4j.common.base.Preconditions; -import org.nd4j.common.primitives.Pair; -import org.nd4j.linalg.api.memory.MemoryWorkspace; -import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration; -import 
org.nd4j.linalg.api.memory.enums.AllocationPolicy; -import org.nd4j.linalg.api.memory.enums.LearningPolicy; -import org.nd4j.linalg.api.memory.enums.ResetPolicy; -import org.nd4j.linalg.api.memory.enums.SpillPolicy; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.dataset.AsyncDataSetIterator; -import org.nd4j.linalg.dataset.DataSet; -import org.nd4j.linalg.dataset.api.MultiDataSet; -import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; -import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator; -import org.nd4j.linalg.exception.ND4JArraySizeException; -import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.heartbeat.Heartbeat; -import org.nd4j.linalg.heartbeat.reports.Environment; -import org.nd4j.linalg.heartbeat.reports.Event; -import org.nd4j.linalg.heartbeat.reports.Task; -import org.nd4j.linalg.heartbeat.utils.EnvironmentUtils; -import org.nd4j.linalg.heartbeat.utils.TaskUtils; -import org.nd4j.linalg.indexing.NDArrayIndex; - -@Slf4j -public class NeuralNetwork extends AbstractNeuralNetwork { - - - //the hidden neural network layers (including output layer) - protected Layer[] layers; - - protected transient ThreadLocal lastEtlTime = new ThreadLocal<>(); - - //Current training data: input features and labels - @Getter @Setter @NonNull - protected INDArray input; - @Getter @Setter - protected INDArray labels; - - //Workspaces for CUDNN. Pass to LayerWorkspaceMgr for re-use in cudnn helpers - @Getter - protected transient Map helperWorkspaces = new HashMap<>(); - - /** - * Used to call optimizers during backprop - */ - @NonNull - protected transient Solver solver = new Solver.Builder().configure(getConfiguration()). - listeners(getTrainingListeners()).model(this).build(); - - - /** - * Create a new NeuralNetwork from the given configuration - * @param conf - */ - public NeuralNetwork(NeuralNetworkConfiguration conf) { - if(! validateConfiguration() ) { - log.error("Configuration '{}' has failed validation.", conf.getName()); - throw new RuntimeException(); - } - log.info("Configuration '{}' has been validated successfully.", conf.getName()); - this.conf = conf; - } - - private boolean validateConfiguration() { - - return true; - } - - private void logNotImplemented( ) { - // getStackTrace() method return - // current method name at 0th index - String method = new Throwable() - .getStackTrace()[1] - .getMethodName(); - log.trace("Method '{}}' is not implemented for {}", method, this.getClass().getSimpleName()); - } - - /** - * This method does initialization of model - *

- * PLEASE NOTE: All implementations should track own state, to avoid double spending - */ - @Override - public void init() { - logNotImplemented(); - } - - /** - * This method returns model parameters as single INDArray - * - * @return - */ - @Override - public INDArray params() { - logNotImplemented(); - return null; - } - - /** - * This method returns updater state (if applicable), null otherwise - * - * @return - */ - @Override - public INDArray updaterState() { - return getUpdater(true) != null ? getUpdater(true).getStateViewArray() : null; - } - - /** - * This method returns Optimizer used for training - * - * @return the optimizer - */ - @Override - public ConvexOptimizer getOptimizer() { - return solver.getOptimizer(); - } - - - - /** Get the updater for this NeuralNetwork from the Solver - * @return Updater for NeuralNetwork - */ - private Updater getUpdater(boolean initializeIfReq) { - if (solver == null && initializeIfReq) { - synchronized(this){ - if(solver == null) { //May have been created while waiting for lock - solver = new Solver.Builder().configure(conf()).listeners(getTrainingListeners()).model(this).build(); - solver.getOptimizer().setUpdater(UpdaterCreator.getUpdater(this)); - } - } - } - if(solver != null) { - return solver.getOptimizer().getUpdater(initializeIfReq); - } - return null; - } - - /** - * Set the updater for the NeuralNetwork in the Solver - * */ - public void setUpdater(@NonNull Updater updater) { - solver.getOptimizer().setUpdater(updater); - } - - - @Override - public void fit(MultiDataSet dataSet) { - if (dataSet.getFeatures().length == 1 && dataSet.getLabels().length == 1) { - INDArray features = dataSet.getFeatures(0); - INDArray labels = dataSet.getLabels(0); - INDArray fMask = null; - INDArray lMask = null; - - if (dataSet.getFeaturesMaskArrays() != null) - fMask = dataSet.getFeaturesMaskArrays()[0]; - - if (dataSet.getFeaturesMaskArrays() != null) - lMask = dataSet.getLabelsMaskArrays()[0]; - - DataSet ds = new DataSet(features, labels, fMask, lMask); - fit(ds); - } else { - throw new DL4JInvalidInputException( - "MultiLayerNetwork can't handle MultiDataSet with more than 1 features or labels array." + - "Please consider use of ComputationGraph"); - } - } - - /** - * Perform minibatch training on all minibatches in the MultiDataSetIterator, for the specified number of epochs. - * Equvalent to calling {@link #fit(MultiDataSetIterator)} numEpochs times in a loop - * - * @param iterator Training data (DataSetIterator). Iterator must support resetting - * @param numEpochs Number of training epochs, >= 1 - */ - public void fit(@NonNull MultiDataSetIterator iterator, int numEpochs){ - Preconditions.checkArgument(numEpochs > 0, "Number of epochs much be > 0. Got numEpochs = %s", numEpochs); - Preconditions.checkArgument(numEpochs == 1 || iterator.resetSupported(), "Cannot perform multiple epochs training using" + - "iterator has does not support resetting (iterator.resetSupported() returned false)"); - - for(int i = 0; i < numEpochs; i++) { - fit(iterator); - } - } - - /** - * Perform minibatch training on all minibatches in the MultiDataSetIterator.
- * Note: The MultiDataSets in the MultiDataSetIterator must have exactly 1 input and output array (as - * MultiLayerNetwork only supports 1 input and 1 output) - * - * @param iterator Training data (DataSetIterator). Iterator must support resetting - */ - @Override - public void fit(MultiDataSetIterator iterator) { - fit(new MultiDataSetWrapperIterator(iterator)); - } - - /** - * Perform minibatch training on all minibatches in the DataSetIterator for 1 epoch.
- * Note that this method does not do layerwise pretraining.
- * For pretraining use method pretrain.. #pretrain(DataSetIterator)
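For orientation, a minimal usage sketch against the equivalent MultiLayerNetwork API that this class mirrors; the variables net (an initialized network) and trainData (any resettable DataSetIterator) are assumed for the example:

    import org.deeplearning4j.optimize.listeners.ScoreIterationListener;

    net.setListeners(new ScoreIterationListener(10)); // log the score every 10 iterations
    for (int epoch = 0; epoch < 3; epoch++) {
        net.fit(trainData);                           // one full pass over the iterator per call
    }
    // equivalent shortcut for several epochs: net.fit(trainData, 3);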
- * @param iterator Training data (DataSetIterator) - */ - @Override - public void fit(DataSetIterator iterator) { - try{ - fitHelper(iterator); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - private synchronized void fitHelper(DataSetIterator iterator){ - // we're wrapping all iterators into AsyncDataSetIterator to provide background prefetch - where appropriate - DataSetIterator iter; - boolean destructable = false; - if (iterator.asyncSupported()) { - iter = new AsyncDataSetIterator(iterator, Math.min( - Nd4j.getAffinityManager().getNumberOfDevices() * 2, 2), true); - destructable = true; - } else { - iter = iterator; - } - - for (TrainingListener tl : trainingListeners) { - tl.onEpochStart(this); - } - - LayerWorkspaceMgr workspaceMgr; - if(conf.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - //Note for updater working memory, we have the option to re-use WS_ALL_LAYERS_ACT or FF/BP_WORKING_MEM - // as these should be closed by the time updaters are executed - //Generally, WS_ALL_LAYERS_ACT will be the larger of the two, so we'll use this - .with(ArrayType.UPDATER_WORKING_MEM, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .build(); - } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - - update(TaskUtils.buildTask(iter)); - if (!iter.hasNext() && iter.resetSupported()) { - iter.reset(); - } - long time1 = System.currentTimeMillis(); - while (iter.hasNext()) { - - DataSet next = iter.next(); - long time2 = System.currentTimeMillis(); - - lastEtlTime.set((time2 - time1)); - - if (next.getFeatures() == null || next.getLabels() == null) - break; - - // TODO: basically we want to wrap internals of this loop into workspace - - - boolean hasMaskArrays = next.hasMaskArrays(); - - if (conf.getBackpropType() == BackpropType.TruncatedBPTT) { - doTruncatedBPTT(next.getFeatures(), next.getLabels(), next.getFeaturesMaskArray(), - next.getLabelsMaskArray(), workspaceMgr); - } else { - if (hasMaskArrays) - setLayerMaskArrays(next.getFeaturesMaskArray(), next.getLabelsMaskArray()); - - setInput(next.getFeatures()); - setLabels(next.getLabels()); - - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getTrainingListeners()).model(this) - .build(); - } - } - - //TODO CACHE - solver.optimize(workspaceMgr); - } - - if (hasMaskArrays) - clearLayerMaskArrays(); - - time1 = System.currentTimeMillis(); - synchronizeIterEpochCounts(); - } - - if (!trainingListeners.isEmpty()) { - for (TrainingListener tl : trainingListeners) { - tl.onEpochEnd(this); - } - } - - clearLayersStates(); - - if (destructable) - ((AsyncDataSetIterator) iter).shutdown(); - - incrementEpochCount(); - } - - - /** - * Workspace for working memory for a single layer: forward pass and backward pass - * Note that this is opened/closed once per op 
(activate/backpropGradient call) - */ - protected static final String WS_LAYER_WORKING_MEM = "WS_LAYER_WORKING_MEM"; - /** - * Workspace for storing all layers' activations - used only to store activations (layer inputs) as part of backprop - * Not used for inference - */ - protected static final String WS_ALL_LAYERS_ACT = "WS_ALL_LAYERS_ACT"; - /** - * Next 2 workspaces: used for: - * (a) Inference: holds activations for one layer only - * (b) Backprop: holds activation gradients for one layer only - * In both cases, they are opened and closed on every second layer - */ - protected static final String WS_LAYER_ACT_1 = "WS_LAYER_ACT_1"; - protected static final String WS_LAYER_ACT_2 = "WS_LAYER_ACT_2"; - - /** - * Workspace for output methods that use OutputAdapter - */ - protected static final String WS_OUTPUT_MEM = "WS_OUTPUT_MEM"; - - /** - * Workspace for working memory in RNNs - opened and closed once per RNN time step - */ - protected static final String WS_RNN_LOOP_WORKING_MEM = "WS_RNN_LOOP_WORKING_MEM"; - - - protected WorkspaceConfiguration WS_LAYER_WORKING_MEM_CONFIG; - - protected static final WorkspaceConfiguration WS_ALL_LAYERS_ACT_CONFIG = WorkspaceConfiguration.builder() - .initialSize(0) - .overallocationLimit(0.05) - .policyLearning(LearningPolicy.FIRST_LOOP) - .policyReset(ResetPolicy.BLOCK_LEFT) - .policySpill(SpillPolicy.REALLOCATE) - .policyAllocation(AllocationPolicy.OVERALLOCATE) - .build(); - - protected WorkspaceConfiguration WS_LAYER_ACT_X_CONFIG; - - protected static final WorkspaceConfiguration WS_RNN_LOOP_WORKING_MEM_CONFIG = WorkspaceConfiguration.builder() - .initialSize(0).overallocationLimit(0.05).policyReset(ResetPolicy.BLOCK_LEFT) - .policyAllocation(AllocationPolicy.OVERALLOCATE).policySpill(SpillPolicy.REALLOCATE) - .policyLearning(LearningPolicy.FIRST_LOOP).build(); - - - boolean initDone; - protected void update(Task task) { - if (!initDone) { - initDone = true; - Heartbeat heartbeat = Heartbeat.getInstance(); - task = ModelSerializer.taskByModel(this); - Environment env = EnvironmentUtils.buildEnvironment(); - heartbeat.reportEvent(Event.STANDALONE, env, task); - } - } - - protected void doTruncatedBPTT(INDArray input, INDArray labels, INDArray featuresMaskArray, - INDArray labelsMaskArray, LayerWorkspaceMgr workspaceMgr) { - if (input.rank() != 3 || labels.rank() != 3) { - log.warn("Cannot do truncated BPTT with non-3d inputs or labels. 
Expect input with shape [miniBatchSize,nIn,timeSeriesLength], got " - + Arrays.toString(input.shape()) + "\tand labels with shape " - + Arrays.toString(labels.shape())); - return; - } - if (input.size(2) != labels.size(2)) { - log.warn("Input and label time series have different lengths: {} input length, {} label length", - input.size(2), labels.size(2)); - return; - } - - int fwdLen = conf.getTbpttFwdLength(); - update(TaskUtils.buildTask(input, labels)); - val timeSeriesLength = input.size(2); - long nSubsets = timeSeriesLength / fwdLen; - if (timeSeriesLength % fwdLen != 0) - nSubsets++; //Example: 100 fwdLen with timeSeriesLength=120 -> want 2 subsets (1 of size 100, 1 of size 20) - - rnnClearPreviousState(); - - for (int i = 0; i < nSubsets; i++) { - long startTimeIdx = (long) i * fwdLen; - long endTimeIdx = startTimeIdx + fwdLen; - if (endTimeIdx > timeSeriesLength) - endTimeIdx = timeSeriesLength; - - if (startTimeIdx > Integer.MAX_VALUE || endTimeIdx > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - INDArray[] subsets = getSubsetsForTbptt((int) startTimeIdx, (int) endTimeIdx, input, labels, - featuresMaskArray, labelsMaskArray); - - setInput(subsets[0]); - setLabels(subsets[1]); - setLayerMaskArrays(subsets[2], subsets[3]); - - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getTrainingListeners()).model(this) - .build(); - } - } - solver.optimize(workspaceMgr); - - //Finally, update the state of the RNN layers: - updateRnnStateWithTBPTTState(); - } - - rnnClearPreviousState(); - clearLayerMaskArrays(); - } - - private INDArray[] getSubsetsForTbptt(int startTimeIdx, int endTimeIdx, INDArray input, INDArray labels, - INDArray fMask, INDArray lMask ){ - INDArray[] out = new INDArray[4]; - out[0] = input.get(NDArrayIndex.all(), NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - out[1] = labels.get(NDArrayIndex.all(), NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - - if (fMask != null) { - out[2] = fMask.get(NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - } - if (lMask != null) { - out[3] = lMask.get(NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - } - - return out; - } - - /** - * Intended for internal/developer use - */ - public void updateRnnStateWithTBPTTState() { - Layer[] layers = conf.calculateInnerLayers().toArray(new Layer[]{}); - for (int i = 0; i < layers.length; i++) { - if (layers[i] instanceof RecurrentLayer) { - RecurrentLayer l = ((RecurrentLayer) layers[i]); - l.rnnSetPreviousState(l.rnnGetTBPTTState()); - } else if (layers[i] instanceof MultiLayerNetwork) { - ((MultiLayerNetwork) layers[i]).updateRnnStateWithTBPTTState(); - } - } - } - - /** Clear the previous state of the RNN layers (if any). 
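For orientation, a minimal sketch of how truncated BPTT is usually switched on from the configuration side, using the upstream DL4J builder API that this class mirrors; the layer sizes and the forward/backward length of 100 are assumptions for illustration:

    import org.deeplearning4j.nn.conf.BackpropType;
    import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
    import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
    import org.deeplearning4j.nn.conf.layers.LSTM;
    import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
    import org.nd4j.linalg.activations.Activation;
    import org.nd4j.linalg.lossfunctions.LossFunctions;

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .list()
            .layer(new LSTM.Builder().nIn(10).nOut(20).activation(Activation.TANH).build())
            .layer(new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(20).nOut(5).activation(Activation.SOFTMAX).build())
            .backpropType(BackpropType.TruncatedBPTT)
            .tBPTTForwardLength(100)   // the fwdLen consumed by doTruncatedBPTT above
            .tBPTTBackwardLength(100)
            .build();
    MultiLayerNetwork net = new MultiLayerNetwork(conf);
    net.init();
    // net.fit(trainData); // a series of length 120 is then split into subsets of 100 and 20 steps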
- */ - public void rnnClearPreviousState() { - Layer[] layers = conf.getLayers().toArray(new Layer[]{}); - if (layers == null) - return; - for (int i = 0; i < layers.length; i++) { - if (layers[i] instanceof RecurrentLayer) - ((RecurrentLayer) layers[i]).rnnClearPreviousState(); - else if (layers[i] instanceof MultiLayerNetwork) { - ((MultiLayerNetwork) layers[i]).rnnClearPreviousState(); - } else if(layers[i] instanceof BaseWrapperLayer && ((BaseWrapperLayer)layers[i]).getUnderlying() instanceof RecurrentLayer){ - ((RecurrentLayer) ((BaseWrapperLayer)layers[i]).getUnderlying()).rnnClearPreviousState(); - } - } - } - - - - /** Remove the mask arrays from all layers.
- * See {@link #setLayerMaskArrays(INDArray, INDArray)} for details on mask arrays. - */ - public void clearLayerMaskArrays() { - Layer[] layers = conf.getLayers().toArray(new Layer[]{}); - for (Layer layer : layers) { - layer.setMaskArray(null); - } - } - - /** - * Increment the epoch count (in the underlying {@link MultiLayerConfiguration} by 1). - * Note that this is done automatically when using iterator-based fitting methods, such as - * {@link #fit(DataSetIterator)}. However, when using non-iterator fit methods (DataSet, INDArray/INDArray etc), - * the network has no way to know when one epoch ends and another starts. In such situations, this method - * can be used to increment the epoch counter.
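A short sketch of the manual bookkeeping described here, assuming an initialized MultiLayerNetwork net and a java.util.List of DataSet minibatches named batches (both hypothetical), with getLayerWiseConfigurations() as the upstream accessor for the underlying configuration:

    for (int epoch = 0; epoch < 5; epoch++) {
        for (org.nd4j.linalg.dataset.DataSet batch : batches) {
            net.fit(batch);            // fit(DataSet) cannot detect epoch boundaries on its own
        }
        net.incrementEpochCount();     // keep epoch-based learning rate schedules in sync
    }
    int epochsSoFar = net.getLayerWiseConfigurations().getEpochCount();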
- * Note that the epoch counter is used for situations such as some learning rate schedules, and the like. - * - * The current epoch count can be obtained using {@code MultiLayerConfiguration.getLayerwiseConfiguration().getEpochCount()} - */ - public void incrementEpochCount(){ - conf.setEpochCount(conf.getEpochCount() + 1); - synchronizeIterEpochCounts(); - } - - protected void synchronizeIterEpochCounts() { - //TODO: this is necessary for some schedules - but the redundant values are a little ugly... - int currIter = conf.getIterationCount(); - int currEpoch = conf.getEpochCount(); - log.error("Something went wrong here. Code incomplete"); - /*for(Layer l : conf.getLayers()) { - l.setIterationCount(currIter); - l.setEpochCount(currEpoch); - } - */ - } - - /** - * This method just makes sure there's no state preserved within layers - */ - public void clearLayersStates() { - for (Layer layer : layers) { - layer.clear(); - layer.clearNoiseWeightParams(); - } - } - - - /**Set the mask arrays for features and labels. Mask arrays are typically used in situations such as one-to-many - * and many-to-one learning with recurrent neural networks, as well as for supporting time series of varying lengths - * within the same minibatch.
- * For example, with RNN data sets with input of shape [miniBatchSize,nIn,timeSeriesLength] and outputs of shape - * [miniBatchSize,nOut,timeSeriesLength], the features and mask arrays will have shape [miniBatchSize,timeSeriesLength] - * and contain values 0 or 1 at each element (to specify whether a given input/example is present - or merely padding - - * at a given time step).
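As an illustration of the [miniBatchSize, timeSeriesLength] masks described above, the sketch below marks the last two steps of the second example as padding before fitting; net is an assumed, already initialized recurrent network and all shapes are invented for the example:

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    INDArray features     = Nd4j.rand(new int[]{2, 10, 5}); // [miniBatchSize, nIn, timeSeriesLength]
    INDArray labels       = Nd4j.rand(new int[]{2, 3, 5});  // [miniBatchSize, nOut, timeSeriesLength]
    INDArray featuresMask = Nd4j.ones(2, 5);                // [miniBatchSize, timeSeriesLength]
    INDArray labelsMask   = Nd4j.ones(2, 5);
    featuresMask.putScalar(new int[]{1, 3}, 0.0);           // second example has only 3 real steps
    featuresMask.putScalar(new int[]{1, 4}, 0.0);
    labelsMask.putScalar(new int[]{1, 3}, 0.0);
    labelsMask.putScalar(new int[]{1, 4}, 0.0);

    net.setLayerMaskArrays(featuresMask, labelsMask);
    net.fit(features, labels);
    net.clearLayerMaskArrays();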
- * NOTE: This method is not usually used directly. Instead, methods such as @link #feedForward(INDArray, INDArray, INDArray)} - * and @link #output(INDArray, boolean, INDArray, INDArray)} handle setting of masking internally. - * @param featuresMaskArray Mask array for features (input) - * @param labelsMaskArray Mask array for labels (output) - * @see #clearLayerMaskArrays() - */ - public void setLayerMaskArrays(INDArray featuresMaskArray, INDArray labelsMaskArray) { - if (featuresMaskArray != null) { - - if (featuresMaskArray.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - //New approach: use feedForwardMaskArray method - feedForwardMaskArray(featuresMaskArray, MaskState.Active, (int) featuresMaskArray.size(0)); - - - /* - //feedforward layers below a RNN layer: need the input (features) mask array - //Reason: even if the time series input is zero padded, the output from the dense layers are - // non-zero (i.e., activationFunction(0*weights + bias) != 0 in general) - //This assumes that the time series input is masked - i.e., values are 0 at the padded time steps, - // so we don't need to do anything for the recurrent layer - - //Now, if mask array is 2d -> need to reshape to 1d (column vector) in the exact same order - // as is done for 3d -> 2d time series reshaping - INDArray reshapedFeaturesMask = TimeSeriesUtils.reshapeTimeSeriesMaskToVector(featuresMaskArray); - - for( int i=0; i feedForwardMaskArray(INDArray maskArray, MaskState currentMaskState, - int minibatchSize) { - if (maskArray == null) { - for (int i = 0; i < layers.length; i++) { - layers[i].feedForwardMaskArray(null, null, minibatchSize); - } - } else { - //Do a forward pass through each preprocessor and layer - for (int i = 0; i < layers.length; i++) { - InputPreProcessor preProcessor = conf.getInputPreProcessors().get(i); - - if (preProcessor != null) { - Pair p = - preProcessor.feedForwardMaskArray(maskArray, currentMaskState, minibatchSize); - if (p != null) { - maskArray = p.getFirst(); - currentMaskState = p.getSecond(); - } else { - maskArray = null; - currentMaskState = null; - } - } - - Pair p = - layers[i].feedForwardMaskArray(maskArray, currentMaskState, minibatchSize); - if (p != null) { - maskArray = p.getFirst(); - currentMaskState = p.getSecond(); - } else { - maskArray = null; - currentMaskState = null; - } - } - } - - return new Pair<>(maskArray, currentMaskState); - } - - -} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/networks/ArtificialNeuralNetwork.java b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/networks/ArtificialNeuralNetwork.java new file mode 100644 index 000000000..0a605b94f --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/net/brutex/ai/dnn/networks/ArtificialNeuralNetwork.java @@ -0,0 +1,53 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.networks; + +import lombok.Getter; +import lombok.Setter; +import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; +import net.brutex.ai.dnn.api.INeuralNetwork; + +/** + * Artificial Neural Network An artificial neural network (1) takes some input data, and (2) + * transforms this input data by calculating a weighted sum over the inputs and (3) applies a + * non-linear function to this transformation to calculate an intermediate state. The three steps + * above constitute what is known as a layer, and the transformative function is often referred to + * as a unit. The intermediate states—often termed features—are used as the input into another + * layer. + *

+ * Through repetition of these steps, the artificial neural network learns multiple layers of + * non-linear features, which it then combines in a final layer to create a prediction. + *
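As a worked illustration of steps (1) to (3), a single layer's transformation can be written directly with ND4J; the shapes and the tanh non-linearity are arbitrary choices for this sketch:

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;
    import org.nd4j.linalg.ops.transforms.Transforms;

    INDArray x = Nd4j.rand(1, 4);            // (1) input data: one example with 4 features
    INDArray W = Nd4j.rand(4, 3);            // weights of a layer with 3 units
    INDArray b = Nd4j.zeros(1, 3);           // biases
    INDArray z = x.mmul(W).addRowVector(b);  // (2) weighted sum over the inputs
    INDArray a = Transforms.tanh(z);         // (3) non-linearity -> intermediate state (features)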

+ * The neural network learns by generating an error signal that measures the difference between the + * predictions of the network and the desired values and then using this error signal to change the + * weights (or parameters) so that predictions get more accurate. + */ +public abstract class ArtificialNeuralNetwork implements INeuralNetwork { + + /** + * A neural network is created from a configuration. + * @param conf The (new net.brutex.ai) configuration for the network + */ + @Getter + @Setter //TODO make this also final and @NonNull + private NeuralNetworkConfiguration configuration; +} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/earlystopping/trainer/BaseEarlyStoppingTrainer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/earlystopping/trainer/BaseEarlyStoppingTrainer.java index a39a08d97..4d6ff7675 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/earlystopping/trainer/BaseEarlyStoppingTrainer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/earlystopping/trainer/BaseEarlyStoppingTrainer.java @@ -346,7 +346,7 @@ public abstract class BaseEarlyStoppingTrainer implements IEarl } else if(model instanceof ComputationGraph){ ComputationGraph cg = ((ComputationGraph) model); listeners = cg.getListeners(); - cg.getConfiguration().setEpochCount(epochNum); + cg.getComputationGraphConfiguration().setEpochCount(epochNum); } else { return; } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/gradientcheck/GradientCheckUtil.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/gradientcheck/GradientCheckUtil.java index 121102214..696e92bc2 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/gradientcheck/GradientCheckUtil.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/gradientcheck/GradientCheckUtil.java @@ -431,7 +431,7 @@ public class GradientCheckUtil { + "DataTypeUtil.setDTypeForContext(DataType.DOUBLE); before using GradientCheckUtil"); } - DataType netDataType = c.net.getConfiguration().getDataType(); + DataType netDataType = c.net.getComputationGraphConfiguration().getDataType(); if (netDataType != DataType.DOUBLE) { throw new IllegalStateException("Cannot perform gradient check: Network datatype is not set to double precision (" + "is: " + netDataType + "). Double precision must be used for gradient checks. Create network with .dataType(DataType.DOUBLE) before using GradientCheckUtil"); @@ -444,8 +444,8 @@ public class GradientCheckUtil { //Check configuration int layerCount = 0; - for (String vertexName : c.net.getConfiguration().getVertices().keySet()) { - GraphVertex gv = c.net.getConfiguration().getVertices().get(vertexName); + for (String vertexName : c.net.getComputationGraphConfiguration().getVertices().keySet()) { + GraphVertex gv = c.net.getComputationGraphConfiguration().getVertices().get(vertexName); if (!(gv instanceof LayerVertex)) continue; LayerVertex lv = (LayerVertex) gv; diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Layer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Layer.java index 60780ab99..e7500055f 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Layer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Layer.java @@ -32,194 +32,209 @@ import org.nd4j.common.primitives.Pair; import java.io.Serializable; import java.util.Collection; +/** + * A layer is the highest-level building block in deep learning. 
A layer is a container that usually
+ * receives weighted input, transforms it with a set of mostly non-linear functions and then passes
+ * these values as output to the next layer. A layer is usually uniform, that is, it contains only
+ * one type of activation function, pooling, convolution, etc., so that it can be easily compared to
+ * other parts of the network. The first and last layers in a network are called input and output
+ * layers, respectively, and all layers in between are called hidden layers.
+ *
+ * @see NVIDIA Deep Learning In A Nutshell
+ */
 public interface Layer extends Serializable, Cloneable, Model, Trainable {
- enum Type {
- FEED_FORWARD, RECURRENT, CONVOLUTIONAL, CONVOLUTIONAL3D,
- SUBSAMPLING, UPSAMPLING, RECURSIVE, MULTILAYER, NORMALIZATION
- }
+ /**
+ * This method sets the given CacheMode for the current layer.
+ *
+ * @param mode the cache mode to use
+ */
+ void setCacheMode(CacheMode mode);
- enum TrainingMode {
- TRAIN, TEST
- }
+ /**
+ * Calculate the regularization component of the score, for the parameters in this layer<br>
For + * example, the L1, L2 and/or weight decay components of the loss function
+ *
+ * @param backpropOnlyParams If true: calculate regularization score based on backprop params
+ * only. If false: calculate based on all params (including pretrain
+ * params, if any)
+ * @return the regularization score for this layer's parameters
+ */
+ double calcRegularizationScore(boolean backpropOnlyParams);
- /**
- * This method sets given CacheMode for current layer
- *
- * @param mode
- */
- void setCacheMode(CacheMode mode);
+ /**
+ * Returns the layer type
+ *
+ * @return the layer type
+ */
+ Type type();
- /**
- * Calculate the regularization component of the score, for the parameters in this layer<br>
- * For example, the L1, L2 and/or weight decay components of the loss function
- * - * @param backpropOnlyParams If true: calculate regularization score based on backprop params only. If false: calculate - * based on all params (including pretrain params, if any) - * @return the regularization score of - */ - double calcRegularizationScore(boolean backpropOnlyParams); + /** + * Calculate the gradient relative to the error in the next layer + * + * @param epsilon w^(L+1)*delta^(L+1). Or, equiv: dC/da, i.e., (dC/dz)*(dz/da) = dC/da, where + * C is cost function a=sigma(z) is activation. + * @param workspaceMgr Workspace manager + * @return Pair where Gradient is gradient for this layer, INDArray is + * epsilon (activation gradient) needed by next layer, but before element-wise multiply by + * sigmaPrime(z). So for standard feed-forward layer, if this layer is L, then return.getSecond() + * == dL/dIn = (w^(L)*(delta^(L))^T)^T. Note that the returned array should be placed in the + * {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATION_GRAD} workspace via the workspace + * manager + */ + Pair backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr); - /** - * Returns the layer type - * - * @return - */ - Type type(); + /** + * Perform forward pass and return the activations array with the last set input + * + * @param training training or test mode + * @param workspaceMgr Workspace manager + * @return the activation (layer output) of the last specified input. Note that the returned array + * should be placed in the {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATIONS} workspace + * via the workspace manager + */ + INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr); + + /** + * Perform forward pass and return the activations array with the specified input + * + * @param input the input to use + * @param training train or test mode + * @param mgr Workspace manager. + * @return Activations array. Note that the returned array should be placed in the + * {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATIONS} workspace via the workspace + * manager + */ + INDArray activate(INDArray input, boolean training, LayerWorkspaceMgr mgr); + + /** + * Get the iteration listeners for this layer. + */ + Collection getListeners(); + + /** + * Set the {@link TrainingListener}s for this model. If any listeners have previously been set, + * they will be replaced by this method + */ + void setListeners(TrainingListener... listeners); + + /** + * Set the {@link TrainingListener}s for this model. If any listeners have previously been set, + * they will be replaced by this method + */ + void setListeners(Collection listeners); + + /** + * Get the layer index. + */ + int getIndex(); + + /** + * Set the layer index. + */ + void setIndex(int index); + + /** + * @return The current iteration count (number of parameter updates) for the layer/network + */ + int getIterationCount(); + + /** + * Set the current iteration count (number of parameter updates) for the layer/network + */ + void setIterationCount(int iterationCount); + + /** + * @return The current epoch count (number of training epochs passed) for the layer/network + */ + int getEpochCount(); + + /** + * Set the current epoch count (number of epochs passed ) for the layer/network + */ + void setEpochCount(int epochCount); + + /** + * Set the layer input. 
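To make the activate(...) contract above concrete, a minimal inference-mode sketch; net is an assumed, already initialized MultiLayerNetwork whose first layer expects 4 inputs:

    import java.util.Arrays;
    import org.deeplearning4j.nn.api.Layer;
    import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    Layer first = net.getLayer(0);
    INDArray x = Nd4j.rand(3, 4);                                              // minibatch of 3 examples
    INDArray act = first.activate(x, false, LayerWorkspaceMgr.noWorkspaces()); // training == false
    System.out.println("activations: " + Arrays.toString(act.shape()));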
+ */ + void setInput(INDArray input, LayerWorkspaceMgr workspaceMgr); + + /** + * Get current/last input mini-batch size, as set by setInputMiniBatchSize(int) + * + * @see Layer#setInputMiniBatchSize(int) + */ + int getInputMiniBatchSize(); + + /** + * Set current/last input mini-batch size.
Used for score and gradient calculations. Mini + * batch size may be different from getInput().size(0) due to reshaping operations - for example, + * when using RNNs with DenseLayerConfiguration and OutputLayer. Called automatically during + * forward pass. + */ + void setInputMiniBatchSize(int size); + + INDArray getMaskArray(); + + /** + * Set the mask array. Note: In general, {@link #feedForwardMaskArray(INDArray, MaskState, int)} + * should be used in preference to this. + * + * @param maskArray Mask array to set + */ + void setMaskArray(INDArray maskArray); + + /** + * Returns true if the layer can be trained in an unsupervised/pretrain manner (AE, VAE, etc) + * + * @return true if the layer can be pretrained (using fit(INDArray), false otherwise + */ + boolean isPretrainLayer(); + + void clearNoiseWeightParams(); + + /** + * A performance optimization: mark whether the layer is allowed to modify its input array + * in-place. In many cases, this is totally safe - in others, the input array will be shared by + * multiple layers, and hence it's not safe to modify the input array. This is usually used by ops + * such as dropout. + * + * @param allow If true: the input array is safe to modify. If false: the input array should be + * copied before it is modified (i.e., in-place modifications are un-safe) + */ + void allowInputModification(boolean allow); + + /** + * Feed forward the input mask array, setting in the layer as appropriate. This allows different + * layers to handle masks differently - for example, bidirectional RNNs and normal RNNs operate + * differently with masks (the former sets activations to 0 outside of the data present region + * (and keeps the mask active for future layers like dense layers), whereas normal RNNs don't zero + * out the activations/errors )instead relying on backpropagated error arrays to handle the + * variable length case.
This is also used for example for networks that contain global + * pooling layers, arbitrary preprocessors, etc. + * + * @param maskArray Mask array to set + * @param currentMaskState Current state of the mask - see {@link MaskState} + * @param minibatchSize Current minibatch size. Needs to be known as it cannot always be + * inferred from the activations array due to reshaping (such as a + * DenseLayerConfiguration within a recurrent neural network) + * @return New mask array after this layer, along with the new mask state. + */ + Pair feedForwardMaskArray(INDArray maskArray, MaskState currentMaskState, + int minibatchSize); + + /** + * @return Get the layer helper, if any + */ + LayerHelper getHelper(); - /** - * Calculate the gradient relative to the error in the next layer - * - * @param epsilon w^(L+1)*delta^(L+1). Or, equiv: dC/da, i.e., (dC/dz)*(dz/da) = dC/da, where C - * is cost function a=sigma(z) is activation. - * @param workspaceMgr Workspace manager - * @return Pair where Gradient is gradient for this layer, INDArray is epsilon (activation gradient) - * needed by next layer, but before element-wise multiply by sigmaPrime(z). So for standard feed-forward layer, if this layer is - * L, then return.getSecond() == dL/dIn = (w^(L)*(delta^(L))^T)^T. Note that the returned array should be placed in the - * {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATION_GRAD} workspace via the workspace manager - */ - Pair backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr); + enum Type { + FEED_FORWARD, RECURRENT, CONVOLUTIONAL, CONVOLUTIONAL3D, + SUBSAMPLING, UPSAMPLING, RECURSIVE, MULTILAYER, NORMALIZATION + } - - /** - * Perform forward pass and return the activations array with the last set input - * - * @param training training or test mode - * @param workspaceMgr Workspace manager - * @return the activation (layer output) of the last specified input. Note that the returned array should be placed - * in the {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATIONS} workspace via the workspace manager - */ - INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr); - - /** - * Perform forward pass and return the activations array with the specified input - * - * @param input the input to use - * @param training train or test mode - * @param mgr Workspace manager. - * @return Activations array. Note that the returned array should be placed in the - * {@link org.deeplearning4j.nn.workspace.ArrayType#ACTIVATIONS} workspace via the workspace manager - */ - INDArray activate(INDArray input, boolean training, LayerWorkspaceMgr mgr); - - /** - * Get the iteration listeners for this layer. - */ - Collection getListeners(); - - /** - * Set the {@link TrainingListener}s for this model. If any listeners have previously been set, they will be - * replaced by this method - */ - void setListeners(TrainingListener... listeners); - - /** - * Set the {@link TrainingListener}s for this model. If any listeners have previously been set, they will be - * replaced by this method - */ - void setListeners(Collection listeners); - - /** - * Set the layer index. - */ - void setIndex(int index); - - /** - * Get the layer index. 
- */ - int getIndex(); - - /** - * @return The current iteration count (number of parameter updates) for the layer/network - */ - int getIterationCount(); - - /** - * @return The current epoch count (number of training epochs passed) for the layer/network - */ - int getEpochCount(); - - /** - * Set the current iteration count (number of parameter updates) for the layer/network - */ - void setIterationCount(int iterationCount); - - /** - * Set the current epoch count (number of epochs passed ) for the layer/network - */ - void setEpochCount(int epochCount); - - /** - * Set the layer input. - */ - void setInput(INDArray input, LayerWorkspaceMgr workspaceMgr); - - /** - * Set current/last input mini-batch size.
- * Used for score and gradient calculations. Mini batch size may be different from - * getInput().size(0) due to reshaping operations - for example, when using RNNs with - * DenseLayer and OutputLayer. Called automatically during forward pass. - */ - void setInputMiniBatchSize(int size); - - /** - * Get current/last input mini-batch size, as set by setInputMiniBatchSize(int) - * - * @see Layer#setInputMiniBatchSize(int) - */ - int getInputMiniBatchSize(); - - /** - * Set the mask array. Note: In general, {@link #feedForwardMaskArray(INDArray, MaskState, int)} should be used in - * preference to this. - * - * @param maskArray Mask array to set - */ - void setMaskArray(INDArray maskArray); - - - INDArray getMaskArray(); - - /** - * Returns true if the layer can be trained in an unsupervised/pretrain manner (AE, VAE, etc) - * - * @return true if the layer can be pretrained (using fit(INDArray), false otherwise - */ - boolean isPretrainLayer(); - - - void clearNoiseWeightParams(); - - /** - * A performance optimization: mark whether the layer is allowed to modify its input array in-place. In many cases, - * this is totally safe - in others, the input array will be shared by multiple layers, and hence it's not safe to - * modify the input array. - * This is usually used by ops such as dropout. - * @param allow If true: the input array is safe to modify. If false: the input array should be copied before it - * is modified (i.e., in-place modifications are un-safe) - */ - void allowInputModification(boolean allow); - - - /** - * Feed forward the input mask array, setting in the layer as appropriate. This allows different layers to - * handle masks differently - for example, bidirectional RNNs and normal RNNs operate differently with masks (the - * former sets activations to 0 outside of the data present region (and keeps the mask active for future layers like - * dense layers), whereas normal RNNs don't zero out the activations/errors )instead relying on backpropagated error - * arrays to handle the variable length case.
- * This is also used for example for networks that contain global pooling layers, arbitrary preprocessors, etc. - * - * @param maskArray Mask array to set - * @param currentMaskState Current state of the mask - see {@link MaskState} - * @param minibatchSize Current minibatch size. Needs to be known as it cannot always be inferred from the activations - * array due to reshaping (such as a DenseLayer within a recurrent neural network) - * @return New mask array after this layer, along with the new mask state. - */ - Pair feedForwardMaskArray(INDArray maskArray, MaskState currentMaskState, int minibatchSize); - - /** - * @return Get the layer helper, if any - */ - LayerHelper getHelper(); + enum TrainingMode { + TRAIN, TEST + } } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ModelAdapter.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ModelAdapter.java index 8b7d816d6..01a60b73e 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ModelAdapter.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ModelAdapter.java @@ -25,7 +25,7 @@ import org.nd4j.linalg.api.ndarray.INDArray; public interface ModelAdapter extends OutputAdapter { /** - * This method invokes model internally, and does convertion to T + * This method invokes model internally, and does conversion to T * @return */ T apply(Model model, INDArray[] inputs, INDArray[] inputMasks, INDArray[] labelsMasks); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ParamInitializer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ParamInitializer.java index 7170953e9..7b6483483 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ParamInitializer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/ParamInitializer.java @@ -41,7 +41,7 @@ public interface ParamInitializer { /** * Get a list of all parameter keys given the layer configuration * - * @param layer Layer + * @param layer ILayer * @return All parameter keys */ List paramKeys(org.deeplearning4j.nn.conf.layers.Layer layer); @@ -49,7 +49,7 @@ public interface ParamInitializer { /** * Weight parameter keys given the layer configuration * - * @param layer Layer + * @param layer ILayer * @return Weight parameter keys */ List weightKeys(org.deeplearning4j.nn.conf.layers.Layer layer); @@ -57,7 +57,7 @@ public interface ParamInitializer { /** * Bias parameter keys given the layer configuration * - * @param layer Layer + * @param layer ILayer * @return Bias parameter keys */ List biasKeys(org.deeplearning4j.nn.conf.layers.Layer layer); @@ -65,7 +65,7 @@ public interface ParamInitializer { /** * Is the specified parameter a weight? * - * @param layer Layer + * @param layer ILayer * @param key Key to check * @return True if parameter is a weight */ @@ -74,7 +74,7 @@ public interface ParamInitializer { /** * Is the specified parameter a bias? 
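A small usage sketch of these parameter-key queries, assuming the upstream DL4J API in which a layer configuration exposes its ParamInitializer via initializer(); the configuration class may be renamed (for example to DenseLayerConfiguration) in this refactor:

    import java.util.List;
    import org.deeplearning4j.nn.conf.layers.DenseLayer;

    DenseLayer dense = new DenseLayer.Builder().nIn(4).nOut(3).build();
    List<String> keys = dense.initializer().paramKeys(dense);          // typically [W, b]
    boolean isBias    = dense.initializer().isBiasParam(dense, "b");   // true
    boolean isWeight  = dense.initializer().isWeightParam(dense, "W"); // true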
* - * @param layer Layer + * @param layer ILayer * @param key Key to check * @return True if parameter is a bias */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/TrainingConfig.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/TrainingConfig.java index ae7601a6f..58f101260 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/TrainingConfig.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/TrainingConfig.java @@ -47,7 +47,7 @@ public interface TrainingConfig { * Is the specified parameter a layerwise pretraining only parameter?
* For example, visible bias params in an autoencoder (or, decoder params in a variational autoencoder) aren't * used during supervised backprop.
- * Layers (like DenseLayer, etc) with no pretrainable parameters will return false for all (valid) inputs. + * Layers (like DenseLayerConfiguration, etc) with no pretrainable parameters will return false for all (valid) inputs. * * @param paramName Parameter name/key * @return True if the parameter is for layerwise pretraining only, false otherwise diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Updater.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Updater.java index d63b57bb8..2c01298cb 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Updater.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/Updater.java @@ -36,7 +36,7 @@ public interface Updater extends Serializable { /** * Set the internal (historical) state view array for this updater * - * @param layer Layer that this updater belongs to + * @param layer ILayer that this updater belongs to * @param viewArray View array * @param initialize Whether to initialize the array or not */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/LayerConstraint.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/LayerConstraint.java index fff8bd77d..cfa82b050 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/LayerConstraint.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/LayerConstraint.java @@ -33,7 +33,7 @@ public interface LayerConstraint extends Cloneable, Serializable { * Apply a given constraint to a layer at each iteration * in the provided epoch, after parameters have been updated. * - * @param layer org.deeplearning4j.nn.api.Layer + * @param layer org.deeplearning4j.nn.api.ILayer * @param iteration given iteration as integer * @param epoch current epoch as integer */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java index 62050b88e..a4f73d3b0 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java @@ -66,10 +66,10 @@ public interface RecurrentLayer extends Layer { * (a) result in the same output
* (b) leave the state maps (both stateMap and tBpttStateMap) in an identical state * - * @param input Layer input + * @param input ILayer input * @param training if true: training. Otherwise: test * @param storeLastForTBPTT If true: store the final state in tBpttStateMap for use in truncated BPTT training - * @return Layer activations + * @return ILayer activations */ INDArray rnnActivateUsingStoredState(INDArray input, boolean training, boolean storeLastForTBPTT, LayerWorkspaceMgr workspaceMg); @@ -92,7 +92,7 @@ public interface RecurrentLayer extends Layer { void rnnSetTBPTTState(Map state); /** - * Truncated BPTT equivalent of Layer.backpropGradient(). + * Truncated BPTT equivalent of ILayer.backpropGradient(). * Primary difference here is that forward pass in the context of BPTT is that we do * forward pass using stored state for truncated BPTT vs. from zero initialization * for standard BPTT. diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/NeuralNetConfiguration.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/NeuralNetConfiguration.java index 69ff898e2..f44a8f3ab 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/NeuralNetConfiguration.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/NeuralNetConfiguration.java @@ -25,6 +25,7 @@ import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; +import net.brutex.ai.dnn.api.INeuralNetworkConfiguration; import org.deeplearning4j.nn.api.OptimizationAlgorithm; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.distribution.Distribution; @@ -68,7 +69,9 @@ import java.util.*; @NoArgsConstructor @Slf4j @EqualsAndHashCode(exclude = {"iterationCount", "epochCount"}) -public class NeuralNetConfiguration implements Serializable, Cloneable { +public class NeuralNetConfiguration implements Serializable, Cloneable, + INeuralNetworkConfiguration { + protected Layer layer; //batch size: primarily used for conv nets. Will be reinforced if set. diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MaxNormConstraint.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MaxNormConstraint.java index 43fdc4254..a38e6dfcf 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MaxNormConstraint.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MaxNormConstraint.java @@ -43,7 +43,7 @@ public class MaxNormConstraint extends BaseConstraint { /** * @param maxNorm Maximum L2 value * @param paramNames Which parameter names to apply constraint to - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ @@ -56,7 +56,7 @@ public class MaxNormConstraint extends BaseConstraint { * Apply to weights but not biases by default * * @param maxNorm Maximum L2 value - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. 
For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MinMaxNormConstraint.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MinMaxNormConstraint.java index 6449a9abd..ca43d4ca0 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MinMaxNormConstraint.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/MinMaxNormConstraint.java @@ -51,7 +51,7 @@ public class MinMaxNormConstraint extends BaseConstraint { * * @param max Maximum L2 value * @param min Minimum L2 value - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ @@ -65,7 +65,7 @@ public class MinMaxNormConstraint extends BaseConstraint { * @param max Maximum L2 value * @param min Minimum L2 value * @param rate Constraint rate - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ @@ -79,7 +79,7 @@ public class MinMaxNormConstraint extends BaseConstraint { * @param min Minimum L2 value * @param rate Constraint rate * @param paramNames Which parameter names to apply constraint to - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/UnitNormConstraint.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/UnitNormConstraint.java index a082056a7..3e80f341b 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/UnitNormConstraint.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/constraint/UnitNormConstraint.java @@ -39,7 +39,7 @@ public class UnitNormConstraint extends BaseConstraint { /** * Apply to weights but not biases by default * - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ @@ -49,7 +49,7 @@ public class UnitNormConstraint extends BaseConstraint { /** - * @param dimensions Dimensions to apply to. For DenseLayer, OutputLayer, RnnOutputLayer, LSTM, etc: this should + * @param dimensions Dimensions to apply to. 
For DenseLayerConfiguration, OutputLayer, RnnOutputLayer, LSTM, etc: this should * be dimension 1. For CNNs, this should be dimensions [1,2,3] corresponding to last 3 of * parameters which have order [depthOut, depthIn, kH, kW] */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/graph/LayerVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/graph/LayerVertex.java index b1734682d..0c7565db1 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/graph/LayerVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/graph/LayerVertex.java @@ -21,7 +21,6 @@ package org.deeplearning4j.nn.conf.graph; import lombok.Data; -import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import org.deeplearning4j.nn.conf.InputPreProcessor; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; @@ -40,8 +39,8 @@ public class LayerVertex extends GraphVertex { private NeuralNetConfiguration layerConf; private InputPreProcessor preProcessor; - //Set outputVertex to true when Layer is an OutputLayer, OR For use in specialized situations like reinforcement learning - // For RL situations, this Layer insn't an OutputLayer, but is the last layer in a graph, that gets its error/epsilon + //Set outputVertex to true when ILayer is an OutputLayer, OR For use in specialized situations like reinforcement learning + // For RL situations, this ILayer insn't an OutputLayer, but is the last layer in a graph, that gets its error/epsilon // passed in externally private boolean outputVertex; @@ -99,7 +98,7 @@ public class LayerVertex extends GraphVertex { public org.deeplearning4j.nn.graph.vertex.GraphVertex instantiate(ComputationGraph graph, String name, int idx, INDArray paramsView, boolean initializeParams, DataType networkDatatype) { //Now, we need to work out if this vertex is an output vertex or not... - boolean isOutput = graph.getConfiguration().getNetworkOutputs().contains(name); + boolean isOutput = graph.getComputationGraphConfiguration().getNetworkOutputs().contains(name); org.deeplearning4j.nn.api.Layer layer = layerConf.getLayer().instantiate(layerConf, null, idx, paramsView, initializeParams, networkDatatype); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ActivationLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ActivationLayer.java index 0fb559c74..0b10cedd4 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ActivationLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ActivationLayer.java @@ -134,7 +134,7 @@ public class ActivationLayer extends NoParamLayer { private IActivation activationFn = null; /** - * Layer activation function. Typical values include:
"relu" (rectified linear), "tanh", "sigmoid", + * ILayer activation function. Typical values include:
"relu" (rectified linear), "tanh", "sigmoid", * "softmax", "hardtanh", "leakyrelu", "maxout", "softsign", "softplus" * * @deprecated Use {@link #activation(Activation)} or {@link @activation(IActivation)} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseLayer.java index fc751e91b..6aad5b0ef 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseLayer.java @@ -176,7 +176,7 @@ public abstract class BaseLayer extends Layer implements Serializable, Cloneable protected double biasInit = Double.NaN; /** - * Gain initialization value, for layers with Layer Normalization. Defaults to 1 + * Gain initialization value, for layers with ILayer Normalization. Defaults to 1 * */ protected double gainInit = Double.NaN; @@ -292,7 +292,7 @@ public abstract class BaseLayer extends Layer implements Serializable, Cloneable } /** - * Gain initialization value, for layers with Layer Normalization. Defaults to 1 + * Gain initialization value, for layers with ILayer Normalization. Defaults to 1 * * @param gainInit Value to use for initializing gain */ diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/CapsuleLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/CapsuleLayer.java index c6f31faf3..4081930c9 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/CapsuleLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/CapsuleLayer.java @@ -63,14 +63,14 @@ public class CapsuleLayer extends SameDiffLayer { this.routings = builder.routings; if(capsules <= 0 || capsuleDimensions <= 0 || routings <= 0){ - throw new IllegalArgumentException("Invalid configuration for Capsule Layer (layer name = \"" + throw new IllegalArgumentException("Invalid configuration for Capsule ILayer (layer name = \"" + layerName + "\"):" + " capsules, capsuleDimensions, and routings must be > 0. Got: " + capsules + ", " + capsuleDimensions + ", " + routings); } if(inputCapsules < 0 || inputCapsuleDimensions < 0){ - throw new IllegalArgumentException("Invalid configuration for Capsule Layer (layer name = \"" + throw new IllegalArgumentException("Invalid configuration for Capsule ILayer (layer name = \"" + layerName + "\"):" + " inputCapsules and inputCapsuleDimensions must be >= 0 if set. 
Got: " + inputCapsules + ", " + inputCapsuleDimensions); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/DenseLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/DenseLayer.java index d77f13e5c..1a6ce905c 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/DenseLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/DenseLayer.java @@ -55,7 +55,7 @@ public class DenseLayer extends FeedForwardLayer { @Override public Layer instantiate(NeuralNetConfiguration conf, Collection trainingListeners, int layerIndex, INDArray layerParamsView, boolean initializeParams, DataType networkDataType) { - LayerValidation.assertNInNOutSet("DenseLayer", getLayerName(), layerIndex, getNIn(), getNOut()); + LayerValidation.assertNInNOutSet("DenseLayerConfiguration", getLayerName(), layerIndex, getNIn(), getNOut()); org.deeplearning4j.nn.layers.feedforward.dense.DenseLayer ret = new org.deeplearning4j.nn.layers.feedforward.dense.DenseLayer(conf, networkDataType); @@ -101,7 +101,7 @@ public class DenseLayer extends FeedForwardLayer { return new LayerMemoryReport.Builder(layerName, DenseLayer.class, inputType, outputType) .standardMemory(numParams, updaterStateSize) .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference - .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer + .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayerConfiguration .build(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Layer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Layer.java index a96ec6db7..66f48dd14 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Layer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Layer.java @@ -205,7 +205,7 @@ public abstract class Layer implements TrainingConfig, Serializable, Cloneable { /** * Is the specified parameter a layerwise pretraining only parameter?
For example, visible * bias params in an autoencoder (or, decoder params in a variational autoencoder) aren't used - * during supervised backprop.
Layers (like DenseLayer, etc) with no pretrainable parameters + * during supervised backprop.
Layers (like DenseLayerConfiguration, etc) with no pretrainable parameters * will return false for all (valid) inputs. * * @param paramName Parameter name/key @@ -255,7 +255,7 @@ public abstract class Layer implements TrainingConfig, Serializable, Cloneable { protected IDropout iDropout; /** - * Layer name assigns layer string name. Allows easier differentiation between layers. + * ILayer name assigns layer string name. Allows easier differentiation between layers. */ public T name(String layerName) { this.setLayerName(layerName); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LayerValidation.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LayerValidation.java index 2a5f16be6..571f884e3 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LayerValidation.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LayerValidation.java @@ -42,7 +42,7 @@ public class LayerValidation { /** * Asserts that the layer nIn and nOut values are set for the layer * - * @param layerType Type of layer ("DenseLayer", etc) + * @param layerType Type of layer ("DenseLayerConfiguration", etc) * @param layerName Name of the layer (may be null if not set) * @param layerIndex Index of the layer * @param nIn nIn value @@ -60,7 +60,7 @@ public class LayerValidation { /** * Asserts that the layer nOut value is set for the layer * - * @param layerType Type of layer ("DenseLayer", etc) + * @param layerType Type of layer ("DenseLayerConfiguration", etc) * @param layerName Name of the layer (may be null if not set) * @param layerIndex Index of the layer * @param nOut nOut value diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java index 8648a2814..98d7fa093 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java @@ -147,7 +147,7 @@ public class LocalResponseNormalization extends Layer { return new LayerMemoryReport.Builder(layerName, DenseLayer.class, inputType, inputType).standardMemory(0, 0) .workingMemory(0, 2 * actElementsPerEx, 0, 3 * actElementsPerEx) - .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer + .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayerConfiguration .build(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/PrimaryCapsules.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/PrimaryCapsules.java index 2107bdede..4d3f56a84 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/PrimaryCapsules.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/PrimaryCapsules.java @@ -87,7 +87,7 @@ public class PrimaryCapsules extends SameDiffLayer { } if(capsules < 0){ - throw new IllegalArgumentException("Invalid configuration for Capsule Layer (layer name = \"" + throw new IllegalArgumentException("Invalid configuration for Capsule ILayer (layer name = \"" + layerName + "\"):" + " capsules must be >= 0 if set. 
Got: " + capsules); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/misc/ElementWiseMultiplicationLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/misc/ElementWiseMultiplicationLayer.java index 79ab2ca54..9eea40cfc 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/misc/ElementWiseMultiplicationLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/misc/ElementWiseMultiplicationLayer.java @@ -113,7 +113,7 @@ public class ElementWiseMultiplicationLayer extends org.deeplearning4j.nn.conf.l return new LayerMemoryReport.Builder(layerName, ElementWiseMultiplicationLayer.class, inputType, outputType) .standardMemory(numParams, updaterStateSize) .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference - .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer + .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayerConfiguration .build(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java index d6004f6bb..54a93b904 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java @@ -44,7 +44,7 @@ public class TimeDistributed extends BaseWrapperLayer { private RNNFormat rnnDataFormat = RNNFormat.NCW; /** - * @param underlying Underlying (internal) layer - should be a feed forward type such as DenseLayer + * @param underlying Underlying (internal) layer - should be a feed forward type such as DenseLayerConfiguration */ public TimeDistributed(@JsonProperty("underlying") @NonNull Layer underlying, @JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat) { super(underlying); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaLayer.java index 51cdb3b6f..0b68bf649 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaLayer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaLayer.java @@ -33,7 +33,7 @@ public abstract class SameDiffLambdaLayer extends SameDiffLayer { * The defineLayer method is used to define the forward pass for the layer * * @param sameDiff SameDiff instance to use to define the vertex - * @param layerInput Layer input variable + * @param layerInput ILayer input variable * @return The output variable (corresponding to the output activations for the layer) */ public abstract SDVariable defineLayer(SameDiff sameDiff, SDVariable layerInput); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaVertex.java index d3c10ec2f..7ec4fb2d5 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaVertex.java +++ 
b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLambdaVertex.java @@ -37,7 +37,7 @@ public abstract class SameDiffLambdaVertex extends SameDiffVertex { * The defineVertex method is used to define the foward pass for the vertex * * @param sameDiff SameDiff instance to use to define the vertex - * @param inputs Layer input variable + * @param inputs ILayer input variable * @return The output variable (orresponding to the output activations for the vertex) */ public abstract SDVariable defineVertex(SameDiff sameDiff, VertexInputs inputs); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/wrapper/BuildingBlockLayer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/wrapper/BuildingBlockLayer.java deleted file mode 100644 index e150b850f..000000000 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/layers/wrapper/BuildingBlockLayer.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * - * ****************************************************************************** - * * - * * This program and the accompanying materials are made available under the - * * terms of the Apache License, Version 2.0 which is available at - * * https://www.apache.org/licenses/LICENSE-2.0. - * * - * * See the NOTICE file distributed with this work for additional - * * information regarding copyright ownership. - * * Unless required by applicable law or agreed to in writing, software - * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * * License for the specific language governing permissions and limitations - * * under the License. - * * - * * SPDX-License-Identifier: Apache-2.0 - * ***************************************************************************** - * - */ - -package org.deeplearning4j.nn.conf.layers.wrapper; - -import java.util.Collection; -import lombok.AccessLevel; -import lombok.Builder; -import lombok.Getter; -import lombok.NonNull; -import net.brutex.ai.dnn.api.LayerConfiguration; -import net.brutex.ai.dnn.api.NeuralNetwork; -import org.deeplearning4j.nn.api.Layer; -import org.deeplearning4j.nn.api.ParamInitializer; -import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.NeuralNetConfiguration; -import net.brutex.ai.dnn.conf.NeuralNetworkConfiguration; -import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.layers.BaseLayer; -import org.deeplearning4j.nn.conf.memory.LayerMemoryReport; -import org.deeplearning4j.optimize.api.TrainingListener; -import org.nd4j.linalg.api.buffer.DataType; -import org.nd4j.linalg.api.ndarray.INDArray; - -@Builder(builderClassName = "Builder", access = AccessLevel.PUBLIC) -public class BuildingBlockLayer extends BaseLayer implements LayerConfiguration { - - @NonNull - @Getter - private NeuralNetworkConfiguration conf; - - @Override - public Layer instantiate(NeuralNetConfiguration conf, - Collection trainingListeners, int layerIndex, INDArray layerParamsView, - boolean initializeParams, DataType networkDataType) { - return null; - } - - @Override - public ParamInitializer initializer() { - return null; - } - - @Override - public InputType getOutputType(int layerIndex, InputType inputType) { - return null; - } - - @Override - public void setNIn(InputType inputType, boolean override) { - - } - - @Override - public InputPreProcessor getPreProcessorForInputType(InputType inputType) { - return 
null; - } - - @Override - public boolean isPretrainParam(String paramName) { - return false; - } - - @Override - public LayerMemoryReport getMemoryReport(InputType inputType) { - return null; - } - - /** - * Create and return an instance of a LayerConfiguration. - * - * @param network the "holding" network for the instance - * @return the new layer instance - */ - @Override - public net.brutex.ai.dnn.api.Layer instantiate(NeuralNetwork network) { - return null; - } -} diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/memory/NetworkMemoryReport.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/memory/NetworkMemoryReport.java index 9182ccfb9..d3f7b1955 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/memory/NetworkMemoryReport.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/memory/NetworkMemoryReport.java @@ -153,7 +153,7 @@ public class NetworkMemoryReport extends MemoryReport { .append(modelName).append("\n").append(" Network Input: ") .append(Arrays.toString(networkInputTypes)).append("\n") .append(" # Layers: ").append(layerAndVertexReports.size()) - .append("\n").append(" Layer Types: ").append(sbLayerCounts) + .append("\n").append(" ILayer Types: ").append(sbLayerCounts) .append("\n"); appendFixedPlusVariable(sb, " Inference Memory (FP32) ", fixedMemBytes, perEx); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/weightnoise/IWeightNoise.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/weightnoise/IWeightNoise.java index 4c45b762f..c6c77d3d2 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/weightnoise/IWeightNoise.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/conf/weightnoise/IWeightNoise.java @@ -33,7 +33,7 @@ public interface IWeightNoise extends Serializable, Cloneable{ /** * Get the parameter, after applying weight noise * - * @param layer Layer to get the parameter for + * @param layer ILayer to get the parameter for * @param paramKey Parameter key * @param iteration Iteration number * @param epoch Epoch number diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java index ac8a05be4..4a080bb28 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java @@ -25,6 +25,8 @@ import lombok.NonNull; import lombok.Setter; import lombok.extern.slf4j.Slf4j; import lombok.val; +import net.brutex.ai.dnn.api.INeuralNetwork; +import net.brutex.ai.dnn.networks.ArtificialNeuralNetwork; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.bytedeco.javacpp.Pointer; @@ -103,9 +105,16 @@ import java.util.*; import java.util.concurrent.atomic.AtomicLong; @Slf4j -public class ComputationGraph implements Serializable, Model, NeuralNetwork { +public class ComputationGraph extends ArtificialNeuralNetwork implements Serializable, Model, + INeuralNetwork { - protected ComputationGraphConfiguration configuration; + /** + * This method returns configuration of this ComputationGraph + * + * @return + */ + @Getter + protected ComputationGraphConfiguration computationGraphConfiguration; protected boolean initCalled = false; protected transient Solver solver; //Used to call optimizers during backprop 
protected INDArray flattenedParams; //Params for all layers are a view/subset of this array @@ -210,17 +219,17 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { private Collection trainingListeners = new ArrayList<>(); - public ComputationGraph(ComputationGraphConfiguration configuration) { - this.configuration = configuration; - this.numInputArrays = configuration.getNetworkInputs().size(); - this.numOutputArrays = configuration.getNetworkOutputs().size(); + public ComputationGraph(ComputationGraphConfiguration computationGraphConfiguration) { + this.computationGraphConfiguration = computationGraphConfiguration; + this.numInputArrays = computationGraphConfiguration.getNetworkInputs().size(); + this.numOutputArrays = computationGraphConfiguration.getNetworkOutputs().size(); this.inputs = new INDArray[numInputArrays]; this.labels = new INDArray[numOutputArrays]; - this.defaultConfiguration = configuration.getDefaultConfiguration(); + this.defaultConfiguration = computationGraphConfiguration.getDefaultConfiguration(); //Working memory: should learn over course of: (a) full forward pass, and (b) full backward pass //Working memory should be opened once per vertex, for each of forward and backward passes - int numWorkingMem = 2 * configuration.getVertices().size(); + int numWorkingMem = 2 * computationGraphConfiguration.getVertices().size(); WS_LAYER_WORKING_MEM_CONFIG = WorkspaceConfiguration.builder() .initialSize(0) .overallocationLimit(0.02) @@ -238,7 +247,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { .initialSize(0) .overallocationLimit(0.02) .policyLearning(LearningPolicy.OVER_TIME) - .cyclesBeforeInitialization(configuration.getVertices().size()) + .cyclesBeforeInitialization(computationGraphConfiguration.getVertices().size()) .policyReset(ResetPolicy.BLOCK_LEFT) .policySpill(SpillPolicy.REALLOCATE) .policyAllocation(AllocationPolicy.OVERALLOCATE) @@ -278,14 +287,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } } - /** - * This method returns configuration of this ComputationGraph - * - * @return - */ - public ComputationGraphConfiguration getConfiguration() { - return configuration; - } + /** * Returns the number of layers in the ComputationGraph @@ -313,7 +315,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { * Get a given layer by name. */ public Layer getLayer(String name) { - Preconditions.checkState(verticesMap.containsKey(name), "Layer with name %s does not exist in the network", name); + Preconditions.checkState(verticesMap.containsKey(name), "ILayer with name %s does not exist in the network", name); return verticesMap.get(name).getLayer(); } @@ -449,7 +451,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if (initCalled) return; - DataType netDtype = getConfiguration().getDataType(); + DataType netDtype = this.getComputationGraphConfiguration().getDataType(); if(parameters != null && parameters.dataType() != netDtype){ Preconditions.checkState(parameters.rank() == 2 && parameters.size(0) == 1, "Invalid parameters array: should be rank 2 with shape [1,numParams]. 
Got %ndShape", parameters); if(cloneParametersArray){ @@ -463,31 +465,31 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } } - if (configuration.getTrainingWorkspaceMode() == null) - configuration.setTrainingWorkspaceMode(WorkspaceMode.NONE); + if (computationGraphConfiguration.getTrainingWorkspaceMode() == null) + computationGraphConfiguration.setTrainingWorkspaceMode(WorkspaceMode.NONE); - if (configuration.getInferenceWorkspaceMode() == null) - configuration.setInferenceWorkspaceMode(WorkspaceMode.NONE); + if (computationGraphConfiguration.getInferenceWorkspaceMode() == null) + computationGraphConfiguration.setInferenceWorkspaceMode(WorkspaceMode.NONE); - if (configuration.getCacheMode() == null) - configuration.setCacheMode(CacheMode.NONE); + if (computationGraphConfiguration.getCacheMode() == null) + computationGraphConfiguration.setCacheMode(CacheMode.NONE); OneTimeLogger.info(log, "Starting ComputationGraph with WorkspaceModes set to [training: {}; inference: {}], cacheMode set to [{}]", - configuration.getTrainingWorkspaceMode(), configuration.getInferenceWorkspaceMode(), configuration.getCacheMode()); + computationGraphConfiguration.getTrainingWorkspaceMode(), computationGraphConfiguration.getInferenceWorkspaceMode(), computationGraphConfiguration.getCacheMode()); //First: build topological ordering, based on configuration. Used for forward pass, backprop and order of parameters/gradients GraphIndices indices = calculateIndices(); topologicalOrder = indices.getTopologicalSortOrder(); //Initialization: create the GraphVertex objects, based on configuration structure - Map configVertexMap = configuration.getVertices(); + Map configVertexMap = computationGraphConfiguration.getVertices(); //Names of all of the (data) inputs to the ComputationGraph - List networkInputNames = configuration.getNetworkInputs(); + List networkInputNames = computationGraphConfiguration.getNetworkInputs(); //Inputs for each layer and GraphNode: - Map> vertexInputs = configuration.getVertexInputs(); - this.vertices = new GraphVertex[networkInputNames.size() + configuration.getVertices().size()]; + Map> vertexInputs = computationGraphConfiguration.getVertexInputs(); + this.vertices = new GraphVertex[networkInputNames.size() + computationGraphConfiguration.getVertices().size()]; //All names: inputs, layers and graph nodes (index to name map) Map allNamesReverse = new HashMap<>(); @@ -504,7 +506,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { long numParams = 0; long[] numParamsForVertex = new long[topologicalOrder.length]; int i = 0; - for (; i < configuration.getNetworkInputs().size(); i++) { + for (; i < computationGraphConfiguration.getNetworkInputs().size(); i++) { numParamsForVertex[i] = 0; //No parameters for input vertices } for(; i < topologicalOrder.length; i++) { @@ -513,7 +515,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { n.setDataType(netDtype); numParamsForVertex[i] = n.numParams(true); if(numParamsForVertex[i] < 0) - throw new DL4JInvalidConfigException("Layer " + name + " had parameters < 0 " + numParamsForVertex[i]); + throw new DL4JInvalidConfigException("ILayer " + name + " had parameters < 0 " + numParamsForVertex[i]); numParams += numParamsForVertex[i]; } @@ -564,7 +566,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { List tempLayerList = new ArrayList<>(); defaultConfiguration.clearVariables(); List variables = defaultConfiguration.variables(false); - i 
= configuration.getNetworkInputs().size(); + i = computationGraphConfiguration.getNetworkInputs().size(); for(; i> seenAsInputTo = new HashMap<>(); - for(Map.Entry> entry : configuration.getVertexInputs().entrySet()){ + for(Map.Entry> entry : computationGraphConfiguration.getVertexInputs().entrySet()){ for(String s : entry.getValue() ){ if (!seenAsInputTo.containsKey(s)) { seenAsInputTo.put(s, new ArrayList()); @@ -709,10 +711,10 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { for(Layer l : layers){ String layerName = l.conf().getLayer().getLayerName(); - List inputs = configuration.getVertexInputs().get(layerName); + List inputs = computationGraphConfiguration.getVertexInputs().get(layerName); String in = inputs.get(0); //For now: layers should have exactly 1 input - if(configuration.getNetworkInputs().contains(in)){ + if(computationGraphConfiguration.getNetworkInputs().contains(in)){ //TODO When is it safe to NOT allow input modifucation? It's not always safe... // For example dropout + iterating over List that is used for multiple epochs... continue; @@ -761,10 +763,10 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { long numParams = 0; long[] numParamsForVertex = new long[topologicalOrder.length]; int i = 0; - for (; i < configuration.getNetworkInputs().size(); i++) { + for (; i < computationGraphConfiguration.getNetworkInputs().size(); i++) { numParamsForVertex[i] = 0; //No parameters for input vertices } - Map configVertexMap = configuration.getVertices(); + Map configVertexMap = computationGraphConfiguration.getVertices(); for (; i < topologicalOrder.length; i++) { String name = indices.getIdxToName().get(i); org.deeplearning4j.nn.conf.graph.GraphVertex n = configVertexMap.get(name); @@ -796,7 +798,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if(outputLayerIdxs == null) { outputLayerIdxs = new int[numOutputArrays]; int i = 0; - for (String s : configuration.getNetworkOutputs()) { + for (String s : computationGraphConfiguration.getNetworkOutputs()) { outputLayerIdxs[i++] = verticesMap.get(s).getVertexIndex(); } } @@ -875,7 +877,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Pretrain a specified layer with the given DataSetIterator * - * @param layerName Layer name + * @param layerName ILayer name * @param dataSetIterator Data */ public void pretrainLayer(String layerName, DataSetIterator dataSetIterator) { @@ -890,7 +892,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Pretrain a specified layer with the given MultiDataSetIterator * - * @param layerName Layer name + * @param layerName ILayer name * @param iter Training data */ public void pretrainLayer(String layerName, MultiDataSetIterator iter) { @@ -920,7 +922,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { int idx = toTrain.getVertexIndex(); LayerWorkspaceMgr workspaceMgr; - if(configuration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ + if(computationGraphConfiguration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); } else { workspaceMgr = LayerWorkspaceMgr.builder() @@ -1133,7 +1135,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { update(TaskUtils.buildTask(inputs, labels)); LayerWorkspaceMgr workspaceMgr; - if(configuration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ + 
if(computationGraphConfiguration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); } else { workspaceMgr = LayerWorkspaceMgr.builder() @@ -1151,7 +1153,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - if (configuration.getBackpropType() == BackpropType.TruncatedBPTT) { + if (computationGraphConfiguration.getBackpropType() == BackpropType.TruncatedBPTT) { doTruncatedBPTT(inputs, labels, featureMaskArrays, labelMaskArrays, workspaceMgr); } else { if (solver == null) { @@ -1202,9 +1204,9 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { //Get cached topological sort order from config, if present - if(configuration.getTopologicalOrder() != null && configuration.getTopologicalOrderStr() != null){ - int[] t = configuration.getTopologicalOrder(); - List s = configuration.getTopologicalOrderStr(); + if(computationGraphConfiguration.getTopologicalOrder() != null && computationGraphConfiguration.getTopologicalOrderStr() != null){ + int[] t = computationGraphConfiguration.getTopologicalOrder(); + List s = computationGraphConfiguration.getTopologicalOrderStr(); Map m1 = new HashMap<>(); Map m2 = new HashMap<>(); for( int i=0; i nodeMap = configuration.getVertices(); - List networkInputNames = configuration.getNetworkInputs(); - int numVertices = networkInputNames.size() + configuration.getVertices().size(); + Map nodeMap = computationGraphConfiguration.getVertices(); + List networkInputNames = computationGraphConfiguration.getNetworkInputs(); + int numVertices = networkInputNames.size() + computationGraphConfiguration.getVertices().size(); int[] out = new int[numVertices]; int outCounter = 0; @@ -1233,7 +1235,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { Map vertexNamesMap = new HashMap<>(); Map vertexNamesMap2 = new HashMap<>(); int i = 0; - for (String inputName : configuration.getNetworkInputs()) { + for (String inputName : computationGraphConfiguration.getNetworkInputs()) { vertexNamesMap.put(i, inputName); vertexNamesMap2.put(inputName, i); i++; @@ -1248,7 +1250,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { Map> inputEdges = new HashMap<>(); //key: vertex. Values: vertices that the key vertex receives input from Map> outputEdges = new HashMap<>(); //key: vertex. 
Values: vertices that the key vertex outputs to - for (String s : configuration.getNetworkInputs()) { + for (String s : computationGraphConfiguration.getNetworkInputs()) { int idx = vertexNamesMap2.get(s); inputEdges.put(idx, null); } @@ -1256,7 +1258,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { for (Map.Entry entry : nodeMap.entrySet()) { String thisVertexName = entry.getKey(); int idx = vertexNamesMap2.get(thisVertexName); - List inputsToThisVertex = configuration.getVertexInputs().get(thisVertexName); + List inputsToThisVertex = computationGraphConfiguration.getVertexInputs().get(thisVertexName); if (inputsToThisVertex == null || inputsToThisVertex.isEmpty()) { inputEdges.put(idx, null); @@ -1324,8 +1326,8 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { for( int idx : out){ s.add(vertexNamesMap.get(idx)); } - configuration.setTopologicalOrder(out); - configuration.setTopologicalOrderStr(s); + computationGraphConfiguration.setTopologicalOrder(out); + computationGraphConfiguration.setTopologicalOrderStr(s); graphIndices = GraphIndices.builder() .topologicalSortOrder(out) @@ -1344,7 +1346,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { synchronizeIterEpochCounts(); LayerWorkspaceMgr workspaceMgr; - if(configuration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ + if(computationGraphConfiguration.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); } else { workspaceMgr = LayerWorkspaceMgr.builder() @@ -1362,7 +1364,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - boolean tbptt = configuration.getBackpropType() == BackpropType.TruncatedBPTT; + boolean tbptt = computationGraphConfiguration.getBackpropType() == BackpropType.TruncatedBPTT; FwdPassType fwdType = (tbptt ? FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE : FwdPassType.STANDARD); synchronizeIterEpochCounts(); @@ -1386,7 +1388,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { score = 0.0; int outNum = 0; - for (String s : configuration.getNetworkOutputs()) { + for (String s : computationGraphConfiguration.getNetworkOutputs()) { GraphVertex gv = verticesMap.get(s); if(gv instanceof LayerVertex) { //At this point: the input to the output layer might not be set on the layer itself - just the vertex @@ -1863,7 +1865,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { int[] layerNums = new int[layers.size()]; for( int i=0; i freeWorkspaceManagers = new ArrayList<>(); //Basically used as a stack Map openActivationsWorkspaces = new IdentityHashMap<>(); - WorkspaceMode wsm = (train ? configuration.getTrainingWorkspaceMode() : configuration.getInferenceWorkspaceMode()); + WorkspaceMode wsm = (train ? computationGraphConfiguration.getTrainingWorkspaceMode() : computationGraphConfiguration.getInferenceWorkspaceMode()); boolean noWS = wsm == WorkspaceMode.NONE; LayerWorkspaceMgr allNone = noWS ? 
LayerWorkspaceMgr.noWorkspaces(helperWorkspaces) : null; List[] closeAtEndIteraton = (List[])new List[topologicalOrder.length]; @@ -2438,7 +2440,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } } else if (fwdPassType == FwdPassType.RNN_TIMESTEP) { if (current.hasLayer()) { - //Layer + //ILayer INDArray input = current.getInputs()[0]; Layer l = current.getLayer(); if (l instanceof RecurrentLayer) { @@ -2562,7 +2564,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { try { - calcBackpropGradients(true, configuration.getBackpropType() == BackpropType.TruncatedBPTT, epsilons); + calcBackpropGradients(true, computationGraphConfiguration.getBackpropType() == BackpropType.TruncatedBPTT, epsilons); return gradient; } catch (OutOfMemoryError e){ CrashReportingUtil.writeMemoryCrashDump(this, e); @@ -2595,19 +2597,19 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { consumed by all layers */ - if(externalEpsilons == null || externalEpsilons.length == 0 && configuration.getTrainingWorkspaceMode() != WorkspaceMode.NONE){ + if(externalEpsilons == null || externalEpsilons.length == 0 && computationGraphConfiguration.getTrainingWorkspaceMode() != WorkspaceMode.NONE){ WorkspaceUtils.assertOpenAndActive(WS_ALL_LAYERS_ACT, "Expected workspace WS_ALL_LAYERS_ACT to be active and open" + " in calcBackpropGradients when workspace mode is not set to NONE"); } //Validate the network configuration for external errors - no output layers if(externalEpsilons != null && externalEpsilons.length > 0){ - List outputLayers = configuration.getNetworkOutputs(); + List outputLayers = computationGraphConfiguration.getNetworkOutputs(); for(String s : outputLayers ){ GraphVertex gv = getVertex(s); if(gv instanceof LayerVertex && gv.getLayer() instanceof IOutputLayer){ throw new IllegalStateException("Cannot perform backprop with external errors in conjunction with an output layer:" + - " output layers cannot use external errors for backprop. Layer name: " + s); + " output layers cannot use external errors for backprop. ILayer name: " + s); } } @@ -2643,7 +2645,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } - boolean noWS = configuration.getInferenceWorkspaceMode() == WorkspaceMode.NONE; + boolean noWS = computationGraphConfiguration.getInferenceWorkspaceMode() == WorkspaceMode.NONE; LayerWorkspaceMgr allNone = noWS ? 
LayerWorkspaceMgr.noWorkspaces(helperWorkspaces) : null; List allWorkspaceManagers = new ArrayList<>(); @@ -2722,7 +2724,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { //(a) it's an output layer (i.e., instanceof IOutputLayer), or //(b) it's a normal layer, but it has been marked as an output layer for use in external errors - for reinforcement learning, for example - int thisOutputNumber = configuration.getNetworkOutputs().indexOf(current.getVertexName()); + int thisOutputNumber = computationGraphConfiguration.getNetworkOutputs().indexOf(current.getVertexName()); Layer currentLayer = current.getLayer(); if (currentLayer instanceof FrozenLayerWithBackprop) { currentLayer = ((FrozenLayerWithBackprop) currentLayer).getInsideLayer(); @@ -2735,7 +2737,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } else { if ((externalEpsilons == null || externalEpsilons.length == 0) && labels[thisOutputNumber] != null) { - throw new DL4JException("Layer \"" + current.getVertexName() + "\" of type " + throw new DL4JException("ILayer \"" + current.getVertexName() + "\" of type " + current.getLayer().getClass().getSimpleName() + " is set as network output " + "(but isn't an IOutputLayer). Only IOutputLayer layers can be fit via backprop with" @@ -2882,7 +2884,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { @Override public ComputationGraph clone() { - ComputationGraph cg = new ComputationGraph(configuration.clone()); + ComputationGraph cg = new ComputationGraph(computationGraphConfiguration.clone()); cg.init(params().dup(), false); if (solver != null) { //If solver is null: updater hasn't been initialized -> getUpdater call will force initialization, however @@ -3019,7 +3021,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if (outputLayerIdx >= numOutputArrays) throw new IllegalArgumentException("Invalid index: cannot get output layer " + outputLayerIdx + ", total number of network outputs = " + numOutputArrays); - return getLayer(configuration.getNetworkOutputs().get(outputLayerIdx)); + return getLayer(computationGraphConfiguration.getNetworkOutputs().get(outputLayerIdx)); } /** @@ -3086,7 +3088,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { private double scoreHelper(MultiDataSet dataSet, boolean training){ LayerWorkspaceMgr mgr; - WorkspaceMode wsm = (training ? configuration.getTrainingWorkspaceMode() : configuration.getInferenceWorkspaceMode()); + WorkspaceMode wsm = (training ? 
computationGraphConfiguration.getTrainingWorkspaceMode() : computationGraphConfiguration.getInferenceWorkspaceMode()); if(wsm == WorkspaceMode.NONE){ mgr = LayerWorkspaceMgr.noWorkspaces(); } else { @@ -3120,7 +3122,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { double r = calcRegularizationScore(true); int i = 0; - for (String s : configuration.getNetworkOutputs()) { + for (String s : computationGraphConfiguration.getNetworkOutputs()) { GraphVertex gv = verticesMap.get(s); Layer outLayer = gv.getLayer(); if (outLayer == null || !(outLayer instanceof IOutputLayer)) { @@ -3180,7 +3182,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { private INDArray scoreExamplesHelper(MultiDataSet dataSet, boolean addRegularizationTerms){ LayerWorkspaceMgr mgr; - if(configuration.getInferenceWorkspaceMode() == WorkspaceMode.NONE){ + if(computationGraphConfiguration.getInferenceWorkspaceMode() == WorkspaceMode.NONE){ mgr = LayerWorkspaceMgr.noWorkspaces(); } else { mgr = LayerWorkspaceMgr.builder() @@ -3212,7 +3214,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { double r = (addRegularizationTerms ? calcRegularizationScore(true) : 0.0); int i = 0; - for (String s : configuration.getNetworkOutputs()) { + for (String s : computationGraphConfiguration.getNetworkOutputs()) { GraphVertex gv = verticesMap.get(s); Layer outLayer = gv.getLayer(); if (outLayer == null || !(outLayer instanceof IOutputLayer)) { @@ -3640,7 +3642,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } if (l == null || !(l instanceof RecurrentLayer)) { throw new UnsupportedOperationException( - "Layer \"" + layerName + "\" is not a recurrent layer. Cannot set state"); + "ILayer \"" + layerName + "\" is not a recurrent layer. Cannot set state"); } ((RecurrentLayer) l).rnnSetPreviousState(state); } @@ -3704,7 +3706,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } } - long fwdLen = configuration.getTbpttFwdLength(); + long fwdLen = computationGraphConfiguration.getTbpttFwdLength(); long nSubsets = timeSeriesLength / fwdLen; if (timeSeriesLength % fwdLen != 0) nSubsets++; @@ -3882,7 +3884,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { // This output doesn't have a mask, we can skip it. 
continue; } - String outputName = configuration.getNetworkOutputs().get(i); + String outputName = computationGraphConfiguration.getNetworkOutputs().get(i); GraphVertex v = verticesMap.get(outputName); Layer ol = v.getLayer(); ol.setMaskArray(labelMaskArrays[i]); @@ -3972,7 +3974,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { labelsList = iterator.getLabels(); Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), Evaluation.class); } @@ -3990,7 +3992,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public T evaluate(MultiDataSetIterator iterator, List labelsList, int topN) { Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), Evaluation.class); } return (T)doEvaluation(iterator, new org.deeplearning4j.eval.Evaluation(labelsList, topN))[0]; @@ -4055,7 +4057,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public T evaluateROC(DataSetIterator iterator, int rocThresholdSteps) { Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROC.class); } return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROC(rocThresholdSteps))[0]; @@ -4078,7 +4080,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public T evaluateROC(MultiDataSetIterator iterator, int rocThresholdSteps) { Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROC.class); } return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROC(rocThresholdSteps))[0]; @@ -4101,7 +4103,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public T evaluateROCMultiClass(DataSetIterator iterator, int rocThresholdSteps) { Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROCMultiClass.class); } return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROCMultiClass(rocThresholdSteps))[0]; @@ -4116,7 +4118,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public T evaluateROCMultiClass(MultiDataSetIterator iterator, int rocThresholdSteps) { Layer outputLayer = getOutputLayer(0); - if(getConfiguration().isValidateOutputLayerConfig()){ + if(this.getComputationGraphConfiguration().isValidateOutputLayerConfig()){ OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROCMultiClass.class); } return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROCMultiClass(rocThresholdSteps))[0]; @@ -4202,13 +4204,13 @@ public class ComputationGraph implements Serializable, 
Model, NeuralNetwork { MultiDataSetIterator iter = iterator.asyncSupported() ? new AsyncMultiDataSetIterator(iterator, 2, true) : iterator; - WorkspaceMode cMode = configuration.getTrainingWorkspaceMode(); - configuration.setTrainingWorkspaceMode(configuration.getInferenceWorkspaceMode()); + WorkspaceMode cMode = computationGraphConfiguration.getTrainingWorkspaceMode(); + computationGraphConfiguration.setTrainingWorkspaceMode(computationGraphConfiguration.getInferenceWorkspaceMode()); - boolean useRnnSegments = (configuration.getBackpropType() == BackpropType.TruncatedBPTT); + boolean useRnnSegments = (computationGraphConfiguration.getBackpropType() == BackpropType.TruncatedBPTT); MemoryWorkspace outputWs; - if(getConfiguration().getInferenceWorkspaceMode() == WorkspaceMode.ENABLED){ + if(this.getComputationGraphConfiguration().getInferenceWorkspaceMode() == WorkspaceMode.ENABLED){ outputWs = Nd4j.getWorkspaceManager().getWorkspaceForCurrentThread(WS_ALL_LAYERS_ACT_CONFIG, WS_OUTPUT_MEM); } else { outputWs = new DummyWorkspace(); @@ -4256,7 +4258,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } else { rnnClearPreviousState(); - int fwdLen = configuration.getTbpttFwdLength(); + int fwdLen = computationGraphConfiguration.getTbpttFwdLength(); long tsLength = -1; long nF = next.getFeatures().length; for (int i = 0; i < nF; i++) { @@ -4309,7 +4311,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if (iterator.asyncSupported()) ((AsyncMultiDataSetIterator) iter).shutdown(); - configuration.setTrainingWorkspaceMode(cMode); + computationGraphConfiguration.setTrainingWorkspaceMode(cMode); return evaluations; } @@ -4380,9 +4382,9 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { String out = "-"; String paramShape = "-"; if (currentVertex.isInputVertex()) { - if (inputTypes != null) vertexOutputs.put(currentVertexName, inputTypes[configuration.getNetworkInputs().indexOf(currentVertexName)]); //for input vertices the outputs are just the input types (only layer vertices have preprocessing?) + if (inputTypes != null) vertexOutputs.put(currentVertexName, inputTypes[computationGraphConfiguration.getNetworkInputs().indexOf(currentVertexName)]); //for input vertices the outputs are just the input types (only layer vertices have preprocessing?) 
} else { - connections = configuration.getVertexInputs().get(currentVertexName).toString(); + connections = computationGraphConfiguration.getVertexInputs().get(currentVertexName).toString(); List inputTypeList = new ArrayList<>(); if (currentVertex.hasLayer()) { Layer currentLayer = currentVertex.getLayer(); @@ -4425,7 +4427,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { inShape = currentInType.toString(); inputTypeList.add(currentInType); - InputPreProcessor layerVertexPreProcesor = ((org.deeplearning4j.nn.conf.graph.LayerVertex)configuration.getVertices().get(currentVertexName)).getPreProcessor(); + InputPreProcessor layerVertexPreProcesor = ((org.deeplearning4j.nn.conf.graph.LayerVertex) computationGraphConfiguration.getVertices().get(currentVertexName)).getPreProcessor(); if (layerVertexPreProcesor != null) { inShape += "-->" + layerVertexPreProcesor.getOutputType(currentInType); } @@ -4444,7 +4446,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { } } if (inputTypes != null) { - InputType currentVertexOutputType = configuration.getVertices().get(currentVertexName).getOutputType(currLayerIdx, inputTypeList.toArray(new InputType[inputTypeList.size()])); + InputType currentVertexOutputType = computationGraphConfiguration.getVertices().get(currentVertexName).getOutputType(currLayerIdx, inputTypeList.toArray(new InputType[inputTypeList.size()])); outShape = currentVertexOutputType.toString(); vertexOutputs.put(currentVertexName, currentVertexOutputType); } @@ -4546,14 +4548,14 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { * The current epoch count can be obtained using {@code ComputationGraph.getConfiguration().getEpochCount()} */ public void incrementEpochCount(){ - configuration.setEpochCount(configuration.getEpochCount() + 1); + computationGraphConfiguration.setEpochCount(computationGraphConfiguration.getEpochCount() + 1); synchronizeIterEpochCounts(); } protected void synchronizeIterEpochCounts(){ //TODO: this is necessrry for some schedules - but the redundant values are a little ugly... 
- int currIter = getConfiguration().getIterationCount(); - int currEpoch = getConfiguration().getEpochCount(); + int currIter = this.getComputationGraphConfiguration().getIterationCount(); + int currEpoch = this.getComputationGraphConfiguration().getEpochCount(); for(Layer l : layers){ l.setIterationCount(currIter); l.setEpochCount(currEpoch); @@ -4565,7 +4567,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { * @return Number of iterations */ public int getIterationCount(){ - return configuration.getIterationCount(); + return computationGraphConfiguration.getIterationCount(); } /** @@ -4576,7 +4578,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { * @return Number of epochs */ public int getEpochCount(){ - return configuration.getEpochCount(); + return computationGraphConfiguration.getEpochCount(); } /** @@ -4633,7 +4635,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { try(MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { INDArray newParams = params().castTo(dataType); - String jsonConfig = getConfiguration().toJson(); + String jsonConfig = this.getComputationGraphConfiguration().toJson(); ComputationGraphConfiguration newConf = ComputationGraphConfiguration.fromJson(jsonConfig); newConf.setDataType(dataType); ComputationGraph newNet = new ComputationGraph(newConf); @@ -4714,7 +4716,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Get the current learning rate, for the specified layer, from the network. * Note: If the layer has no learning rate (no parameters, or an updater without a learning rate) then null is returned - * @param layerName Layer name + * @param layerName ILayer name * @return Learning rate for the specified layer, or null */ public Double getLearningRate(String layerName){ @@ -4724,7 +4726,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Return the layer size (number of units) for the specified layer. * Note that the meaning of the "layer size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, recurrent layers: number of units (nOut configuration option)
+ * - DenseLayerConfiguration, OutputLayer, recurrent layers: number of units (nOut configuration option)
* - ConvolutionLayer: the channels (number of channels)
* - Subsampling layers, global pooling layers, etc: size of 0 is always returned
* @@ -4733,7 +4735,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public long layerSize(int layer) { if (layer < 0 || layer > layers.length) { - throw new IllegalArgumentException("Invalid layer index: " + layer + ". Layer index must be between 0 and " + throw new IllegalArgumentException("Invalid layer index: " + layer + ". ILayer index must be between 0 and " + (layers.length - 1) + " inclusive"); } return layerSize(layers[layer].conf().getLayer().getLayerName()); @@ -4742,7 +4744,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Return the input size (number of inputs) for the specified layer.
* Note that the meaning of the "input size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, etc: the feature vector size (nIn configuration option)
+ * - DenseLayerConfiguration, OutputLayer, etc: the feature vector size (nIn configuration option)
* - Recurrent layers: the feature vector size per time step (nIn configuration option)
* - ConvolutionLayer: the channels (number of channels)
* - Subsampling layers, global pooling layers, etc: size of 0 is always returned
@@ -4752,7 +4754,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { */ public long layerInputSize(int layer) { if (layer < 0 || layer > layers.length) { - throw new IllegalArgumentException("Invalid layer index: " + layer + ". Layer index must be between 0 and " + throw new IllegalArgumentException("Invalid layer index: " + layer + ". ILayer index must be between 0 and " + (layers.length - 1) + " inclusive"); } return layerInputSize(layers[layer].conf().getLayer().getLayerName()); @@ -4761,7 +4763,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Return the layer size (number of units) for the specified layer.
* Note that the meaning of the "layer size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, recurrent layers: number of units (nOut configuration option)
+ * - DenseLayerConfiguration, OutputLayer, recurrent layers: number of units (nOut configuration option)
* - ConvolutionLayer: the channels (number of channels)
* - Subsampling layers, global pooling layers, etc: size of 0 is always returned
* @@ -4785,7 +4787,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { /** * Return the input size (number of inputs) for the specified layer.
* Note that the meaning of the "input size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, etc: the feature vector size (nIn configuration option)
+ * - DenseLayerConfiguration, OutputLayer, etc: the feature vector size (nIn configuration option)
* - Recurrent layers: the feature vector size per time step (nIn configuration option)
* - ConvolutionLayer: the channels (number of channels)
* - Subsampling layers, global pooling layers, etc: size of 0 is always returned
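A minimal sketch of how these size accessors read the configured values, assuming the stock DL4J graph-builder API (which this patch is in the middle of renaming) and a made-up network with one hidden DenseLayer named "dense" (nIn = 784, nOut = 100); the class name below is hypothetical and only for illustration:

    import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
    import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
    import org.deeplearning4j.nn.conf.layers.DenseLayer;
    import org.deeplearning4j.nn.conf.layers.OutputLayer;
    import org.deeplearning4j.nn.graph.ComputationGraph;
    import org.nd4j.linalg.activations.Activation;
    import org.nd4j.linalg.lossfunctions.LossFunctions;

    public class LayerSizeExample {   // hypothetical name, illustration only
        public static void main(String[] args) {
            ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .graphBuilder()
                    .addInputs("in")
                    .addLayer("dense", new DenseLayer.Builder()
                            .nIn(784).nOut(100).activation(Activation.RELU).build(), "in")
                    .addLayer("out", new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(100).nOut(10).activation(Activation.SOFTMAX).build(), "dense")
                    .setOutputs("out")
                    .build();

            ComputationGraph graph = new ComputationGraph(conf);
            graph.init();

            long units  = graph.layerSize("dense");      // 100 -> nOut of the dense layer
            long inputs = graph.layerInputSize("dense"); // 784 -> nIn of the dense layer
            System.out.println(units + " units, " + inputs + " inputs");
        }
    }

The int-indexed overloads (layerSize(0), layerInputSize(0)) behave the same way; the index refers to the layer's position in the network's layer array rather than its name.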
@@ -4860,7 +4862,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if (obj instanceof ComputationGraph) { ComputationGraph network = (ComputationGraph) obj; boolean paramsEquals = network.params().equals(params()); - boolean confEquals = getConfiguration().equals(network.getConfiguration()); + boolean confEquals = this.getComputationGraphConfiguration().equals(network.getComputationGraphConfiguration()); boolean updaterEquals = getUpdater().equals(network.getUpdater()); return paramsEquals && confEquals && updaterEquals; } @@ -4875,7 +4877,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { val cg = ModelSerializer.restoreComputationGraph(ois, true); this.defaultConfiguration = cg.defaultConfiguration.clone(); - this.configuration = cg.configuration.clone(); + this.computationGraphConfiguration = cg.computationGraphConfiguration.clone(); this.init(); this.flattenedParams.assign(cg.flattenedParams); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/BaseGraphVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/BaseGraphVertex.java index afffe99d4..cdb124d75 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/BaseGraphVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/BaseGraphVertex.java @@ -58,8 +58,8 @@ public abstract class BaseGraphVertex implements GraphVertex { protected INDArray[] inputs; protected INDArray epsilon; - //Set outputVertex to true when Layer is an OutputLayer, OR For use in specialized situations like reinforcement learning - // For RL situations, this Layer insn't an OutputLayer, but is the last layer in a graph, that gets its error/epsilon + //Set outputVertex to true when ILayer is an OutputLayer, OR For use in specialized situations like reinforcement learning + // For RL situations, this ILayer insn't an OutputLayer, but is the last layer in a graph, that gets its error/epsilon // passed in externally @Setter @Getter protected boolean outputVertex; diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/GraphVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/GraphVertex.java index 73e4b2fc4..61136e0db 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/GraphVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/GraphVertex.java @@ -40,7 +40,7 @@ public interface GraphVertex extends Trainable, Serializable { /** Get the index of the GraphVertex */ int getVertexIndex(); - /** Get the number of input arrays. For example, a Layer may have only one input array, but in general a GraphVertex + /** Get the number of input arrays. For example, a ILayer may have only one input array, but in general a GraphVertex * may have an arbtrary (>=1) number of input arrays (for example, from multiple other layers) */ int getNumInputArrays(); @@ -85,7 +85,7 @@ public interface GraphVertex extends Trainable, Serializable { /** Set the GraphVertex to be an output vertex */ void setOutputVertex(boolean outputVertex); - /** Get the Layer (if any). Returns null if {@link #hasLayer()} == false */ + /** Get the ILayer (if any). Returns null if {@link #hasLayer()} == false */ Layer getLayer(); /** Set the input activations. 
diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/LayerVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/LayerVertex.java index fdd05c390..60f3dad0b 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/LayerVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/LayerVertex.java @@ -124,10 +124,10 @@ public class LayerVertex extends BaseGraphVertex { public Pair doBackward(boolean tbptt, LayerWorkspaceMgr workspaceMgr) { if (!canDoBackward()) { if(inputs == null || inputs[0] == null){ - throw new IllegalStateException("Cannot do backward pass: inputs not set. Layer: \"" + vertexName + throw new IllegalStateException("Cannot do backward pass: inputs not set. ILayer: \"" + vertexName + "\" (idx " + vertexIndex + "), numInputs: " + getNumInputArrays()); } else { - throw new IllegalStateException("Cannot do backward pass: all epsilons not set. Layer \"" + vertexName + throw new IllegalStateException("Cannot do backward pass: all epsilons not set. ILayer \"" + vertexName + "\" (idx " + vertexIndex + "), numInputs :" + getNumInputArrays() + "; numOutputs: " + getNumOutputConnections()); } @@ -142,7 +142,7 @@ public class LayerVertex extends BaseGraphVertex { if (tbptt && layer instanceof RecurrentLayer) { //Truncated BPTT for recurrent layers pair = ((RecurrentLayer) layer).tbpttBackpropGradient(epsilon, - graph.getConfiguration().getTbpttBackLength(), workspaceMgr); + graph.getComputationGraphConfiguration().getTbpttBackLength(), workspaceMgr); } else { //Normal backprop pair = layer.backpropGradient(epsilon, workspaceMgr); //epsTotal may be null for OutputLayers diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/DuplicateToTimeSeriesVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/DuplicateToTimeSeriesVertex.java index 2bfc6ee97..27eb238d3 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/DuplicateToTimeSeriesVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/DuplicateToTimeSeriesVertex.java @@ -48,10 +48,10 @@ public class DuplicateToTimeSeriesVertex extends BaseGraphVertex { VertexIndices[] inputVertices, VertexIndices[] outputVertices, String inputName, DataType dataType) { super(graph, name, vertexIndex, inputVertices, outputVertices, dataType); this.inputName = inputName; - this.inputVertexIndex = graph.getConfiguration().getNetworkInputs().indexOf(inputName); + this.inputVertexIndex = graph.getComputationGraphConfiguration().getNetworkInputs().indexOf(inputName); if (inputVertexIndex == -1) throw new IllegalArgumentException("Invalid input name: \"" + inputName + "\" not found in list " - + "of network inputs (" + graph.getConfiguration().getNetworkInputs() + ")"); + + "of network inputs (" + graph.getComputationGraphConfiguration().getNetworkInputs() + ")"); } @Override diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/LastTimeStepVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/LastTimeStepVertex.java index 0475936d0..4402dc4c5 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/LastTimeStepVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/LastTimeStepVertex.java @@ -54,10 
+54,10 @@ public class LastTimeStepVertex extends BaseGraphVertex { VertexIndices[] outputVertices, String inputName, DataType dataType) { super(graph, name, vertexIndex, inputVertices, outputVertices, dataType); this.inputName = inputName; - this.inputIdx = graph.getConfiguration().getNetworkInputs().indexOf(inputName); + this.inputIdx = graph.getComputationGraphConfiguration().getNetworkInputs().indexOf(inputName); if (inputIdx == -1) throw new IllegalArgumentException("Invalid input name: \"" + inputName + "\" not found in list " - + "of network inputs (" + graph.getConfiguration().getNetworkInputs() + ")"); + + "of network inputs (" + graph.getComputationGraphConfiguration().getNetworkInputs() + ")"); } @Override diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/ReverseTimeSeriesVertex.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/ReverseTimeSeriesVertex.java index 359a576a3..86b5dcab3 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/ReverseTimeSeriesVertex.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/graph/vertex/impl/rnn/ReverseTimeSeriesVertex.java @@ -48,10 +48,10 @@ public class ReverseTimeSeriesVertex extends BaseGraphVertex { this.inputIdx = -1; } else { // Find the given input - this.inputIdx = graph.getConfiguration().getNetworkInputs().indexOf(inputName); + this.inputIdx = graph.getComputationGraphConfiguration().getNetworkInputs().indexOf(inputName); if (inputIdx == -1) throw new IllegalArgumentException("Invalid input name: \"" + inputName + "\" not found in list " - + "of network inputs (" + graph.getConfiguration().getNetworkInputs() + ")"); + + "of network inputs (" + graph.getComputationGraphConfiguration().getNetworkInputs() + ")"); } } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java index 3ad4f8b0a..fa03d3c51 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java @@ -79,7 +79,7 @@ public class LSTMHelpers { ) { //Mini-batch data format: for mini-batch size m, nIn inputs, and T time series length - //Data has shape [m,nIn,T]. Layer activations/output has shape [m,nHiddenUnits,T] + //Data has shape [m,nIn,T]. 
ILayer activations/output has shape [m,nHiddenUnits,T] if (input == null || input.length() == 0) throw new IllegalArgumentException("Invalid input: not set or 0 length"); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java index 18397bd4d..0f81392f9 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java @@ -26,6 +26,8 @@ import lombok.NonNull; import lombok.Setter; import lombok.extern.slf4j.Slf4j; import lombok.val; +import net.brutex.ai.dnn.api.INeuralNetwork; +import net.brutex.ai.dnn.networks.ArtificialNeuralNetwork; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.bytedeco.javacpp.Pointer; @@ -38,9 +40,7 @@ import org.deeplearning4j.nn.api.layers.IOutputLayer; import org.deeplearning4j.nn.api.layers.RecurrentLayer; import org.deeplearning4j.nn.conf.*; import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.layers.ConvolutionLayer; import org.deeplearning4j.nn.conf.layers.FeedForwardLayer; -import org.deeplearning4j.nn.conf.layers.SubsamplingLayer; import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional; import org.deeplearning4j.nn.gradient.DefaultGradient; import org.deeplearning4j.nn.gradient.Gradient; @@ -99,1097 +99,1235 @@ import org.nd4j.common.util.OneTimeLogger; import java.io.*; import java.util.*; - +/** + * Artificial Neural Network An artificial neural network (1) takes some input data, and (2) + * transforms this input data by calculating a weighted sum over the inputs and (3) applies a + * non-linear function to this transformation to calculate an intermediate state. The three steps + * above constitute what is known as a layer, and the transformative function is often referred to + * as a unit. The intermediate states—often termed features—are used as the input into another + * layer. + *
+ * Through repetition of these steps, the artificial neural network learns multiple layers of
+ * non-linear features, which it then combines in a final layer to create a prediction.
+ *
+ * The neural network learns by generating an error signal that measures the difference between the + * predictions of the network and the desired values and then using this error signal to change the + * weights (or parameters) so that predictions get more accurate. + */ @Slf4j -public class MultiLayerNetwork implements Serializable, Classifier, Layer, org.deeplearning4j.nn.api.NeuralNetwork { +public class MultiLayerNetwork extends ArtificialNeuralNetwork implements Serializable, Classifier, Layer, + INeuralNetwork { - //the hidden neural network layers (including output layer) - protected Layer[] layers; - protected LinkedHashMap layerMap = new LinkedHashMap<>(); - - //Current training data: input features and labels - protected INDArray input, labels; - - protected boolean initCalled = false; - protected Collection trainingListeners = new ArrayList<>(); - - protected NeuralNetConfiguration defaultConfiguration; - protected MultiLayerConfiguration layerWiseConfigurations; - protected Gradient gradient; - protected double score; - @Setter - protected boolean initDone = false; - protected INDArray flattenedParams; //Params for all layers are a view/subset of this array - @Getter - protected transient INDArray flattenedGradients; //Gradients for all layers are a view/subset of this array - - protected boolean clearTbpttState = true; //Mainly for unit testing (should be enabled otherwise) - protected transient ThreadLocal lastEtlTime = new ThreadLocal<>(); - protected INDArray mask; - - protected int layerIndex; //For Layer.get/setIndex() - - protected transient Solver solver; //Used to call optimizers during backprop - //Workspaces for CUDNN. Pass to LayerWorkspaceMgr for re-use in cudnn helpers - @Getter - protected transient Map helperWorkspaces = new HashMap<>(); + /** + * Workspace for working memory for a single layer: forward pass and backward pass Note that this + * is opened/closed once per op (activate/backpropGradient call) + */ + protected static final String WS_LAYER_WORKING_MEM = "WS_LAYER_WORKING_MEM"; + /** + * Workspace for storing all layers' activations - used only to store activations (layer inputs) + * as part of backprop Not used for inference + */ + protected static final String WS_ALL_LAYERS_ACT = "WS_ALL_LAYERS_ACT"; + /** + * Next 2 workspaces: used for: (a) Inference: holds activations for one layer only (b) Backprop: + * holds activation gradients for one layer only In both cases, they are opened and closed on + * every second layer + */ + protected static final String WS_LAYER_ACT_1 = "WS_LAYER_ACT_1"; + protected static final String WS_LAYER_ACT_2 = "WS_LAYER_ACT_2"; + /** + * Workspace for output methods that use OutputAdapter + */ + protected static final String WS_OUTPUT_MEM = "WS_OUTPUT_MEM"; + /** + * Workspace for working memory in RNNs - opened and closed once per RNN time step + */ + protected static final String WS_RNN_LOOP_WORKING_MEM = "WS_RNN_LOOP_WORKING_MEM"; + protected static final WorkspaceConfiguration WS_ALL_LAYERS_ACT_CONFIG = WorkspaceConfiguration.builder() + .initialSize(0) + .overallocationLimit(0.05) + .policyLearning(LearningPolicy.FIRST_LOOP) + .policyReset(ResetPolicy.BLOCK_LEFT) + .policySpill(SpillPolicy.REALLOCATE) + .policyAllocation(AllocationPolicy.OVERALLOCATE) + .build(); + protected static final WorkspaceConfiguration WS_RNN_LOOP_WORKING_MEM_CONFIG = WorkspaceConfiguration.builder() + .initialSize(0).overallocationLimit(0.05).policyReset(ResetPolicy.BLOCK_LEFT) + 
.policyAllocation(AllocationPolicy.OVERALLOCATE).policySpill(SpillPolicy.REALLOCATE) + .policyLearning(LearningPolicy.FIRST_LOOP).build(); + //the hidden neural network layers (including output layer) + protected Layer[] layers; + protected LinkedHashMap layerMap = new LinkedHashMap<>(); + //Current training data: input features and labels + protected INDArray input, labels; + protected boolean initCalled = false; + protected Collection trainingListeners = new ArrayList<>(); + protected NeuralNetConfiguration defaultConfiguration; + protected MultiLayerConfiguration layerWiseConfigurations; + protected Gradient gradient; + protected double score; + @Setter + protected boolean initDone = false; + protected INDArray flattenedParams; //Params for all layers are a view/subset of this array + @Getter + protected transient INDArray flattenedGradients; //Gradients for all layers are a view/subset of this array + protected boolean clearTbpttState = true; //Mainly for unit testing (should be enabled otherwise) + protected transient ThreadLocal lastEtlTime = new ThreadLocal<>(); + protected INDArray mask; + protected int layerIndex; //For Layer.get/setIndex() + protected transient Solver solver; //Used to call optimizers during backprop + //Workspaces for CUDNN. Pass to LayerWorkspaceMgr for re-use in cudnn helpers + @Getter + protected transient Map helperWorkspaces = new HashMap<>(); + protected WorkspaceConfiguration WS_LAYER_WORKING_MEM_CONFIG; + protected WorkspaceConfiguration WS_LAYER_ACT_X_CONFIG; - /** - * Workspace for working memory for a single layer: forward pass and backward pass - * Note that this is opened/closed once per op (activate/backpropGradient call) - */ - protected static final String WS_LAYER_WORKING_MEM = "WS_LAYER_WORKING_MEM"; - /** - * Workspace for storing all layers' activations - used only to store activations (layer inputs) as part of backprop - * Not used for inference - */ - protected static final String WS_ALL_LAYERS_ACT = "WS_ALL_LAYERS_ACT"; - /** - * Next 2 workspaces: used for: - * (a) Inference: holds activations for one layer only - * (b) Backprop: holds activation gradients for one layer only - * In both cases, they are opened and closed on every second layer - */ - protected static final String WS_LAYER_ACT_1 = "WS_LAYER_ACT_1"; - protected static final String WS_LAYER_ACT_2 = "WS_LAYER_ACT_2"; + public MultiLayerNetwork(MultiLayerConfiguration conf) { + this.layerWiseConfigurations = conf; + this.defaultConfiguration = conf.getConf(0).clone(); - /** - * Workspace for output methods that use OutputAdapter - */ - protected static final String WS_OUTPUT_MEM = "WS_OUTPUT_MEM"; + //Working memory: should learn over course of: (a) full forward pass, and (b) full backward pass + //Working memory should be opened once per layer and once per preprocessor, for each of forward and backward passes + int numWorkingMem = 2 * (layerWiseConfigurations.getConfs().size() + + layerWiseConfigurations.getInputPreProcessors().size()); + WS_LAYER_WORKING_MEM_CONFIG = getLayerWorkingMemWSConfig(numWorkingMem); + WS_LAYER_ACT_X_CONFIG = getLayerActivationWSConfig(layerWiseConfigurations.getConfs().size()); + } - /** - * Workspace for working memory in RNNs - opened and closed once per RNN time step - */ - protected static final String WS_RNN_LOOP_WORKING_MEM = "WS_RNN_LOOP_WORKING_MEM"; + /** + * Initialize the network based on the configuration (a MultiLayerConfiguration in JSON format) + * and parameters array + * + * @param conf the configuration json + * @param params the 
parameters for the network + */ + public MultiLayerNetwork(String conf, INDArray params) { + this(MultiLayerConfiguration.fromJson(conf)); + init(); + setParameters(params); + } + /** + * Initialize the network based on the configuration and parameters array + * + * @param conf the configuration + * @param params the parameters + */ + public MultiLayerNetwork(MultiLayerConfiguration conf, INDArray params) { + this(conf); + init(); + setParameters(params); + } - protected WorkspaceConfiguration WS_LAYER_WORKING_MEM_CONFIG; + protected static WorkspaceConfiguration getLayerWorkingMemWSConfig(int numWorkingMemCycles) { + return WorkspaceConfiguration.builder() + .initialSize(0) + .overallocationLimit(0.02) + .policyLearning(LearningPolicy.OVER_TIME) + .cyclesBeforeInitialization(numWorkingMemCycles) + .policyReset(ResetPolicy.BLOCK_LEFT) + .policySpill(SpillPolicy.REALLOCATE) + .policyAllocation(AllocationPolicy.OVERALLOCATE) + .build(); + } - protected static final WorkspaceConfiguration WS_ALL_LAYERS_ACT_CONFIG = WorkspaceConfiguration.builder() - .initialSize(0) - .overallocationLimit(0.05) - .policyLearning(LearningPolicy.FIRST_LOOP) - .policyReset(ResetPolicy.BLOCK_LEFT) - .policySpill(SpillPolicy.REALLOCATE) - .policyAllocation(AllocationPolicy.OVERALLOCATE) + protected static WorkspaceConfiguration getLayerActivationWSConfig(int numLayers) { + //Activations memory: opened once per layer - for every second layer (preprocessors are within the loop). + //Technically we could set learning to numLayers / 2, but will set to numLayers for simplicity, and also to + // account for a backward pass + return WorkspaceConfiguration.builder() + .initialSize(0) + .overallocationLimit(0.02) + .policyLearning(LearningPolicy.OVER_TIME) + .cyclesBeforeInitialization(numLayers) + .policyReset(ResetPolicy.BLOCK_LEFT) + .policySpill(SpillPolicy.REALLOCATE) + .policyAllocation(AllocationPolicy.OVERALLOCATE) + .build(); + } + + /** + * Restore a MultiLayerNetwork to a file, saved using {@link #save(File)} or + * {@link ModelSerializer} + * + * @param f File to load the network from + * @param loadUpdater If true: load the updater if it is available (i.e., the state array for + * momentum/Adam/rmsprop etc) - use false if no further training is + * required, or true if further training will be undertaken + * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) + */ + public static MultiLayerNetwork load(File f, boolean loadUpdater) throws IOException { + return ModelSerializer.restoreMultiLayerNetwork(f, loadUpdater); + } + + /** + * This method sets specified CacheMode for all layers within network + * + * @param mode + */ + public void setCacheMode(CacheMode mode) { + if (mode == null) { + mode = CacheMode.NONE; + } + + for (Layer layer : layers) { + layer.setCacheMode(mode); + } + } + + /** + * Get the last ETL time. This in informational, and is the amount of time in milliseconds that + * was required to obtain the last DataSet/MultiDataSet during fitting. A value consistently above + * 0 may indicate a data feeding bottleneck, or no asynchronous data prefetching (async prefetch + * is enabled by default) + * + * @return The last ETL time in milliseconds, if avaliable (or 0 if not) + */ + public long getLastEtlTime() { + Long time = lastEtlTime.get(); + return time == null ? 0L : time; + } + + /** + * Set the last ETL time in milliseconds, for informational/reporting purposes. Generally used + * internally. 
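A minimal usage sketch for the restore and informational methods above, assuming the usual java.io.File and CacheMode imports, a model previously saved to the hypothetical file model.zip, and an enclosing method that declares IOException:

    // Restore a saved network (updater included) and query informational state.
    MultiLayerNetwork net = MultiLayerNetwork.load(new File("model.zip"), true);
    net.setCacheMode(CacheMode.NONE);      // null would also be mapped to NONE by the setter
    long lastEtlMs = net.getLastEtlTime(); // 0 until a fit(...) call has recorded an ETL time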
+ * + * @param time ETL time + */ + public void setLastEtlTime(long time) { + lastEtlTime.set(time); + } + + protected void intializeConfigurations() { + if (layerWiseConfigurations == null) { + layerWiseConfigurations = new MultiLayerConfiguration.Builder().build(); + } + + if (layers == null) { + layers = new Layer[getnLayers()]; + } + + if (defaultConfiguration == null) { + defaultConfiguration = new NeuralNetConfiguration.Builder().build(); + } + } + + /** + * Perform layerwise pretraining for one epoch - see {@link #pretrain(DataSetIterator, int)} + */ + public void pretrain(DataSetIterator iter) { + pretrain(iter, 1); + } + + /** + * Perform layerwise unsupervised training on all pre-trainable layers in the network (VAEs, + * Autoencoders, etc), for the specified number of epochs each. For example, if numEpochs=3, then + * layer 0 will be fit for 3 epochs, followed by layer 1 for 3 epochs, and so on.
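A minimal sketch of the layerwise pretraining entry points just described, assuming an initialized MultiLayerNetwork net whose layer 0 is pretrainable (e.g. an autoencoder or VAE) and a resettable DataSetIterator trainData:

    // Unsupervised pretraining of a single layer for several epochs; a no-op for non-pretrainable layers.
    net.pretrainLayer(0, trainData, 3);
    // Or pretrain every pretrainable layer in sequence, 3 epochs each:
    net.pretrain(trainData, 3);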
Note that + * pretraining will be performed on one layer after the other. To perform unsupervised training on + * a single layer, use {@link #pretrainLayer(int, DataSetIterator)} + * + * @param iter Training data + */ + public void pretrain(DataSetIterator iter, int numEpochs) { + if (flattenedGradients == null) { + initGradientsView(); + } + + for (int i = 0; i < getnLayers(); i++) { + pretrainLayer(i, iter, numEpochs); + } + } + + /** + * Fit for one epoch - see {@link #pretrainLayer(int, DataSetIterator, int)} + */ + public void pretrainLayer(int layerIdx, DataSetIterator iter) { + pretrainLayer(layerIdx, iter, 1); + } + + /** + * Perform layerwise unsupervised training on a single pre-trainable layer in the network (VAEs, + * Autoencoders, etc) for the specified number of epochs
If the specified layer index (0 to + * numLayers - 1) is not a pretrainable layer, this is a no-op. + * + * @param layerIdx Index of the layer to train (0 to numLayers-1) + * @param iter Training data + * @param numEpochs Number of epochs to fit the specified layer for + */ + public void pretrainLayer(int layerIdx, DataSetIterator iter, int numEpochs) { + Preconditions.checkState(numEpochs > 0, "Number of epochs (%s) must be a positive number", + numEpochs); + + if (flattenedGradients == null) { + initGradientsView(); + } + if (layerIdx >= layers.length) { + throw new IllegalArgumentException( + "Cannot pretrain layer: layerIdx (" + layerIdx + ") >= numLayers (" + layers.length + + ")"); + } + + Layer layer = layers[layerIdx]; + if (!layer.isPretrainLayer()) { + return; + } + + if (numEpochs > 1 && !iter.resetSupported()) { + throw new IllegalStateException("Cannot fit multiple epochs (" + numEpochs + + ") on an iterator that doesn't support resetting"); + } + + if (!iter.hasNext() && iter.resetSupported()) { + iter.reset(); + } + + log.info( + "Starting unsupervised training on layer " + layerIdx + " for " + numEpochs + " epochs"); + for (int i = 0; i < numEpochs; i++) { + if (i > 0) { + iter.reset(); + } + + while (iter.hasNext()) { + DataSet next = iter.next(); + input = next.getFeatures(); + pretrainLayer(layerIdx, input); + } + } + + int ec = getLayer(layerIdx).conf().getEpochCount() + 1; + getLayer(layerIdx).conf().setEpochCount(ec); + } + + /** + * Perform layerwise unsupervised training on a single pre-trainable layer in the network (VAEs, + * Autoencoders, etc)
If the specified layer index (0 to numLayers - 1) is not a pretrainable + * layer, this is a no-op. + * + * @param layerIdx Index of the layer to train (0 to numLayers-1) + * @param features Training data array + */ + public void pretrainLayer(int layerIdx, INDArray features) { + setInput(features); + setLayerMaskArrays(null, null); + + if (flattenedGradients == null) { + initGradientsView(); + } + if (layerIdx >= layers.length) { + throw new IllegalArgumentException( + "Cannot pretrain layer: layerIdx (" + layerIdx + ") >= numLayers (" + layers.length + + ")"); + } + + LayerWorkspaceMgr workspaceMgr; + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + workspaceMgr = LayerWorkspaceMgr.builder() + .defaultWorkspace(WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + } + workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); + + Layer layer = layers[layerIdx]; + if (!layer.isPretrainLayer()) { + return; + } + + //Do forward pass to the layer to be pretrained + INDArray outputOfPrevLayer; + if (layerIdx == 0) { + outputOfPrevLayer = input; + } else { + //Yes, this part of training - but we'll do forward psas as inference mode when doing layerwise training + // to effectively freeze earlier layers and not apply dropout etc + outputOfPrevLayer = outputOfLayerDetached(false, FwdPassType.STANDARD, layerIndex - 1, + features, null, null, null); + } + + try (MemoryWorkspace ws = workspaceMgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { + if (layerWiseConfigurations.getInputPreProcess(layerIdx) != null) { + + if (input.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + outputOfPrevLayer = layerWiseConfigurations.getInputPreProcess(layerIdx) + .preProcess(outputOfPrevLayer, (int) input.size(0), + LayerWorkspaceMgr.noWorkspaces(helperWorkspaces)); + } + + layer.fit(outputOfPrevLayer, workspaceMgr); + } + } + + @Override + public int batchSize() { + //In 99+% of cases, the input and labels dimension 0 size should be identical + //The only real exceptions: space to batch, and batch to space layers + //In those cases, we should base it on the labels size, as this impacts gradient calculation + if (input.size(0) > Integer.MAX_VALUE || labels.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + return labels == null ? (int) input.size(0) : (int) labels.size(0); + } + + @Override + public NeuralNetConfiguration conf() { + return defaultConfiguration; + } + + @Override + public void setConf(NeuralNetConfiguration conf) { + throw new UnsupportedOperationException(); + } + + @Override + public INDArray input() { + return input; + } + + @Override + public ConvexOptimizer getOptimizer() { + return solver.getOptimizer(); + } + + /** + * Get one parameter array for the network.
In MultiLayerNetwork, parameters are keyed like + * "0_W" and "0_b" to mean "weights of layer index 0" and "biases of layer index 0" respectively. + * Numbers increment sequentially, and the suffixes ("W", "b" etc) depend on the layer type, and + * are defined in the relevant parameter initializers for each layer.
Note that the returned + * INDArrays are views of the underlying network parameters, so modifications of the returned + * arrays will impact the parameters of the network. + * + * @param param the key of the parameter + * @return The specified parameter array for the network + * @see #paramTable() paramTable() method, for a map of all parameters + */ + @Override + public INDArray getParam(String param) { + //Get params for MultiLayerNetwork sub layers. + int idx = param.indexOf('_'); + if (idx == -1) { + throw new IllegalStateException( + "Invalid param key: does not have layer separator: \"" + param + "\""); + } + int layerIdx = Integer.parseInt(param.substring(0, idx)); + String newKey = param.substring(idx + 1); + + return layers[layerIdx].getParam(newKey); + } + + /** + * Return a map of all parameters in the network. Parameter names are as described in + * {@link #getParam(String)}. As per {@link #getParam(String)} the returned arrays are views - + * modifications to these will impact the underlying network parameters + * + * @return A map of all parameters in the network + */ + @Override + public Map paramTable() { + return paramTable(false); + } + + /** + * Returns a map of all parameters in the network as per {@link #paramTable()}.
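A minimal sketch of the parameter keying described above ("<layerIndex>_<paramName>"), assuming an initialized MultiLayerNetwork net whose layer 0 has weight and bias parameters W and b:

    INDArray w0 = net.getParam("0_W");   // weights of layer 0 (a view of the flattened parameters)
    INDArray b0 = net.getParam("0_b");   // biases of layer 0
    w0.muli(0.5);                        // in-place changes are visible to the network immediately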
Optionally + * (with backpropParamsOnly=true) only the 'backprop' parameters are returned - that is, any + * parameters involved only in unsupervised layerwise pretraining not standard inference/backprop + * are excluded from the returned list. + * + * @param backpropParamsOnly If true, return backprop params only. If false: return all params + * @return Parameters for the network + */ + public Map paramTable(boolean backpropParamsOnly) { + //Get all parameters from all layers + Map allParams = new LinkedHashMap<>(); + for (int i = 0; i < layers.length; i++) { + Map paramMap = layers[i].paramTable(backpropParamsOnly); + for (Map.Entry entry : paramMap.entrySet()) { + String newKey = i + "_" + entry.getKey(); + allParams.put(newKey, entry.getValue()); + } + } + return allParams; + } + + /** + * Intended for internal use + */ + @Override + public boolean updaterDivideByMinibatch(String paramName) { + int idx = paramName.indexOf('_'); + int layerIdx = Integer.parseInt(paramName.substring(0, idx)); + String subName = paramName.substring(idx + 1); + return getLayer(layerIdx).updaterDivideByMinibatch(subName); + } + + /** + * Set the parameters of the netowrk. Note that the parameter keys must match the format as + * described in {@link #getParam(String)} and {@link #paramTable()}. Note that the values of the + * parameters used as an argument to this method are copied - i.e., it is safe to later + * modify/reuse the values in the provided paramTable without this impacting the network. + * + * @param paramTable Parameters to set + */ + @Override + public void setParamTable(Map paramTable) { + Map currParamTable = paramTable(); + if (!currParamTable.keySet().equals(paramTable.keySet())) { + throw new IllegalArgumentException( + "Cannot set param table: parameter keys do not match.\n" + "Current: " + + currParamTable.keySet() + "\nTo set: " + paramTable.keySet()); + } + + for (String s : paramTable.keySet()) { + INDArray curr = currParamTable.get(s); + INDArray toSet = paramTable.get(s); + if (!Arrays.equals(curr.shape(), toSet.shape())) { + throw new IllegalArgumentException( + "Cannot set parameter table: parameter \"" + s + "\" shapes " + + "do not match. Current = " + Arrays.toString(curr.shape()) + ", to set = " + + Arrays.toString(toSet.shape())); + } + } + + //Now that we've checked ALL params (to avoid leaving net in half-modified state) + for (String s : paramTable.keySet()) { + INDArray curr = currParamTable.get(s); + INDArray toSet = paramTable.get(s); + curr.assign(toSet); + } + } + + /** + * Set the values of a single parameter. See {@link #setParamTable(Map)} and + * {@link #getParam(String)} for more details. + * + * @param key the key of the parameter to set + * @param val the new values for the parameter + */ + @Override + public void setParam(String key, INDArray val) { + //Set params for MultiLayerNetwork sub layers. + int idx = key.indexOf('_'); + if (idx == -1) { + throw new IllegalStateException( + "Invalid param key: not have layer separator: \"" + key + "\""); + } + int layerIdx = Integer.parseInt(key.substring(0, idx)); + String newKey = key.substring(idx + 1); + + layers[layerIdx].setParam(newKey, val); + } + + /** + * Get the configuration for the network + * + * @return Network configuration + */ + public MultiLayerConfiguration getLayerWiseConfigurations() { + return layerWiseConfigurations; + } + + /** + * This method is intended for internal/developer use only. 
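A minimal sketch of copying parameters between two networks via the table-based accessors above, assuming source and target are initialized MultiLayerNetworks built from the same configuration:

    // setParamTable(...) copies values, so later changes to the source map do not affect target.
    java.util.Map<String, INDArray> params = source.paramTable();
    target.setParamTable(params);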
+ */ + public void setLayerWiseConfigurations(MultiLayerConfiguration layerWiseConfigurations) { + this.layerWiseConfigurations = layerWiseConfigurations; + } + + /** + * Initialize the MultiLayerNetwork. This should be called once before the network is used. This + * is functionally equivalent to calling {@code init(null, false)}. + * + * @see MultiLayerNetwork#init(INDArray, boolean) + */ + public void init() { + init(null, false); + } + + /** + * Initialize the MultiLayerNetwork, optionally with an existing parameters array. If an existing + * parameters array is specified, it will be used (and the values will not be modified) in the + * network; if no parameters array is specified, parameters will be initialized randomly according + * to the network configuration. + * + * @param parameters Network parameter. May be null. If null: randomly initialize. + * @param cloneParametersArray Whether the parameter array (if any) should be cloned, or used + * directly + */ + public void init(INDArray parameters, boolean cloneParametersArray) { + if (layerWiseConfigurations == null || layers == null) { + intializeConfigurations(); + } + if (initCalled) { + return; + } + + DataType netDtype = getLayerWiseConfigurations().getDataType(); + if (parameters != null && parameters.dataType() != netDtype) { + Preconditions.checkState(parameters.rank() == 2 && parameters.size(0) == 1, + "Invalid parameters array: should be rank 2 with shape [1,numParams]. Got %ndShape", + parameters); + if (cloneParametersArray) { + try (MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + parameters = parameters.castTo(netDtype); + } + } else { + throw new IllegalStateException( + "Error initializing network: Network datatype is set to " + netDtype + + " but provided array has datatype " + parameters.dataType() + + " with cloneParametersArray argument" + + " set to false. 
Cannot initialize net with specified datatype array if that array does not match network datatype"); + } + } + + if (layerMap == null) { + layerMap = new LinkedHashMap<>(); + } + + if (layerWiseConfigurations.getTrainingWorkspaceMode() == null) { + layerWiseConfigurations.setTrainingWorkspaceMode(WorkspaceMode.NONE); + } + + if (layerWiseConfigurations.getInferenceWorkspaceMode() == null) { + layerWiseConfigurations.setInferenceWorkspaceMode(WorkspaceMode.NONE); + } + + if (layerWiseConfigurations.getCacheMode() == null) { + layerWiseConfigurations.setCacheMode(CacheMode.NONE); + } + + OneTimeLogger.info(log, + "Starting MultiLayerNetwork with WorkspaceModes set to [training: {}; inference: {}], cacheMode set to [{}]", + layerWiseConfigurations.getTrainingWorkspaceMode(), + layerWiseConfigurations.getInferenceWorkspaceMode(), + layerWiseConfigurations.getCacheMode()); + + int nLayers = getnLayers(); + + if (nLayers < 1) { + throw new IllegalStateException("Unable to create network: number of layers is less than 1"); + } + + if (this.layers == null || this.layers[0] == null) { + if (this.layers == null) { + this.layers = new Layer[nLayers]; + } + + //First: Work out total length of params + long paramLength = 0; + val nParamsPerLayer = new long[nLayers]; + for (int i = 0; i < nLayers; i++) { + NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); + conf.getLayer().setDataType(netDtype); + nParamsPerLayer[i] = conf.getLayer().initializer().numParams(conf); + paramLength += nParamsPerLayer[i]; + } + + //Create parameters array, if required + boolean initializeParams; + if (parameters != null) { + if (!parameters.isRowVectorOrScalar()) { + throw new IllegalArgumentException("Invalid parameters: should be a row vector"); + } + if (parameters.length() != paramLength) { + throw new IllegalArgumentException("Invalid parameters: expected length " + paramLength + + ", got length " + parameters.length()); + } + + if (cloneParametersArray) { + flattenedParams = parameters.dup(); + } else { + flattenedParams = parameters; + } + + initializeParams = false; + } else if (paramLength > 0) { + flattenedParams = Nd4j.create(netDtype, 1, paramLength); + initializeParams = true; + } else { + //Edge case: 0 params in network + flattenedParams = null; + initializeParams = false; + } + + //Set RNG seed, for repeatability between initializations when set + if (initializeParams) { + Nd4j.getRandom().setSeed(getDefaultConfiguration().getSeed()); + } + + // construct multi-layer + long paramCountSoFar = 0; + for (int i = 0; i < nLayers; i++) { + INDArray paramsView; + if (nParamsPerLayer[i] > 0) { + paramsView = flattenedParams.get(NDArrayIndex.interval(0, 0, true), + NDArrayIndex.interval(paramCountSoFar, paramCountSoFar + nParamsPerLayer[i])); + } else { + paramsView = null; + } + paramCountSoFar += nParamsPerLayer[i]; + + NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); + layers[i] = conf.getLayer() + .instantiate(conf, trainingListeners, i, paramsView, initializeParams, netDtype); + layerMap.put(conf.getLayer().getLayerName(), layers[i]); + } + initCalled = true; + } + + //Set parameters in MultiLayerNetwork.defaultConfiguration for later use in BaseOptimizer.setupSearchState() etc + defaultConfiguration.clearVariables(); + List variables = defaultConfiguration.variables(false); + for (int i = 0; i < layers.length; i++) { + if (layers[i] == null) { + throw new IllegalStateException( + "Encountered null layer during initialization for layer " + i + + ": " + 
layerWiseConfigurations.getConf(i).getLayer().getClass().getSimpleName() + + " initialization " + + "returned null layer?"); + } + + for (String s : layers[i].conf().variables()) { + variables.add(i + "_" + s); + } + } + + // now we init solver & optimizer + if (solver == null) { + try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) .build(); - - protected WorkspaceConfiguration WS_LAYER_ACT_X_CONFIG; - - protected static final WorkspaceConfiguration WS_RNN_LOOP_WORKING_MEM_CONFIG = WorkspaceConfiguration.builder() - .initialSize(0).overallocationLimit(0.05).policyReset(ResetPolicy.BLOCK_LEFT) - .policyAllocation(AllocationPolicy.OVERALLOCATE).policySpill(SpillPolicy.REALLOCATE) - .policyLearning(LearningPolicy.FIRST_LOOP).build(); - - - public MultiLayerNetwork(MultiLayerConfiguration conf) { - this.layerWiseConfigurations = conf; - this.defaultConfiguration = conf.getConf(0).clone(); - - //Working memory: should learn over course of: (a) full forward pass, and (b) full backward pass - //Working memory should be opened once per layer and once per preprocessor, for each of forward and backward passes - int numWorkingMem = 2 * (layerWiseConfigurations.getConfs().size() + layerWiseConfigurations.getInputPreProcessors().size()); - WS_LAYER_WORKING_MEM_CONFIG = getLayerWorkingMemWSConfig(numWorkingMem); - WS_LAYER_ACT_X_CONFIG = getLayerActivationWSConfig(layerWiseConfigurations.getConfs().size()); + solver.initOptimizer(); + } } - protected static WorkspaceConfiguration getLayerWorkingMemWSConfig(int numWorkingMemCycles){ - return WorkspaceConfiguration.builder() - .initialSize(0) - .overallocationLimit(0.02) - .policyLearning(LearningPolicy.OVER_TIME) - .cyclesBeforeInitialization(numWorkingMemCycles) - .policyReset(ResetPolicy.BLOCK_LEFT) - .policySpill(SpillPolicy.REALLOCATE) - .policyAllocation(AllocationPolicy.OVERALLOCATE) - .build(); + //Mark that input modification is allowed. + //TODO When is it safe to NOT skip the very first layer? It's not always safe... + // For example dropout + iterating over List that is used for multiple epochs... + for (int i = 1; i < layers.length; i++) { + layers[i].allowInputModification(true); } - protected static WorkspaceConfiguration getLayerActivationWSConfig(int numLayers){ - //Activations memory: opened once per layer - for every second layer (preprocessors are within the loop). - //Technically we could set learning to numLayers / 2, but will set to numLayers for simplicity, and also to - // account for a backward pass - return WorkspaceConfiguration.builder() - .initialSize(0) - .overallocationLimit(0.02) - .policyLearning(LearningPolicy.OVER_TIME) - .cyclesBeforeInitialization(numLayers) - .policyReset(ResetPolicy.BLOCK_LEFT) - .policySpill(SpillPolicy.REALLOCATE) - .policyAllocation(AllocationPolicy.OVERALLOCATE) - .build(); + synchronizeIterEpochCounts(); + } + + /** + * This method allows you to specificy GradientsAccumulator instance to be used with this + * model
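A minimal sketch of the construction and initialization paths above, assuming conf is a valid MultiLayerConfiguration:

    MultiLayerNetwork net = new MultiLayerNetwork(conf);
    net.init();   // random parameters, per the configuration's seed and initializers
    // Rebuild an identical network from the JSON form of the configuration plus the flattened parameters:
    MultiLayerNetwork copy = new MultiLayerNetwork(conf.toJson(), net.params());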
+ *
+ * PLEASE NOTE: Do not use this method unless you understand how to use GradientsAccumulator &
+ * updates sharing.
PLEASE NOTE: Do not use this method on standalone model + * + * @param accumulator Gradient accumulator to use for the network + */ + public void setGradientsAccumulator(GradientsAccumulator accumulator) { + if (!isInitCalled()) { + init(); } - /** - * This method sets specified CacheMode for all layers within network - * - * @param mode - */ - public void setCacheMode(CacheMode mode) { - if (mode == null) - mode = CacheMode.NONE; - - for (Layer layer : layers) { - layer.setCacheMode(mode); - } + if (solver == null) { + try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) + .build(); + } } - /** - * Set the last ETL time in milliseconds, for informational/reporting purposes. Generally used internally. - * @param time ETL time - */ - public void setLastEtlTime(long time) { - lastEtlTime.set(time); - } + solver.getOptimizer().setGradientsAccumulator(accumulator); + } - /** - * Get the last ETL time. This in informational, and is the amount of time in milliseconds that was required - * to obtain the last DataSet/MultiDataSet during fitting. - * A value consistently above 0 may indicate a data feeding bottleneck, or no asynchronous data prefetching (async - * prefetch is enabled by default) - * @return The last ETL time in milliseconds, if avaliable (or 0 if not) - */ - public long getLastEtlTime() { - Long time = lastEtlTime.get(); - return time == null ? 0L : time; - } + public boolean isInitCalled() { + return initCalled; + } - /** - * Initialize the network based on the configuration (a MultiLayerConfiguration in JSON format) and parameters array - * - * @param conf the configuration json - * @param params the parameters for the network - */ - public MultiLayerNetwork(String conf, INDArray params) { - this(MultiLayerConfiguration.fromJson(conf)); + /** + * This method: initializes the flattened gradients array (used in backprop) and sets the + * appropriate subset in all layers. As a general rule, this shouldn't ever need to be called + * manually when doing training via fit(DataSet) or fit(DataSetIterator) + */ + public void initGradientsView() { + try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + if (layers == null) { init(); - setParameters(params); + } + + int nLayers = layers.length; + + //First: Work out total length of params + long paramLength = 0; + val nParamsPerLayer = new long[nLayers]; + for (int i = 0; i < nLayers; i++) { + NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); + nParamsPerLayer[i] = conf.getLayer().initializer().numParams(conf); + paramLength += nParamsPerLayer[i]; + } + + if (paramLength > 0) { + flattenedGradients = Nd4j.create(flattenedParams.dataType(), new long[]{1, paramLength}, + 'f'); //No need to initialize, as each layer will do it each iteration anyway + } + + long paramsSoFar = 0; + for (int i = 0; i < layers.length; i++) { + if (nParamsPerLayer[i] == 0) { + continue; //This layer doesn't have any parameters... 
+ } + INDArray thisLayerGradView = flattenedGradients.get(NDArrayIndex.interval(0, 0, true), + NDArrayIndex.interval(paramsSoFar, paramsSoFar + nParamsPerLayer[i])); + layers[i].setBackpropGradientsViewArray(thisLayerGradView); + paramsSoFar += nParamsPerLayer[i]; + } + } + } + + protected INDArray activationFromPrevLayer(int curr, INDArray input, boolean training, + LayerWorkspaceMgr mgr) { + if (getLayerWiseConfigurations().getInputPreProcess(curr) != null) { + input = getLayerWiseConfigurations().getInputPreProcess(curr) + .preProcess(input, getInputMiniBatchSize(), mgr); } + INDArray ret = layers[curr].activate(input, training, mgr); + return ret; + } - /** - * Initialize the network based on the configuration and parameters array - * - * @param conf the configuration - * @param params the parameters - */ - public MultiLayerNetwork(MultiLayerConfiguration conf, INDArray params) { - this(conf); - init(); - setParameters(params); + /** + * Calculate activation for few layers at once. Suitable for autoencoder partial activation. + *
+ * In example: in 10-layer deep autoencoder, layers 0 - 4 inclusive are used for encoding part, + * and layers 5-9 inclusive are used for decoding part. + * + * @param from first layer to be activated, inclusive + * @param to last layer to be activated, inclusive + * @return the activation from the last layer + */ + public INDArray activateSelectedLayers(int from, int to, INDArray input) { + if (input == null) { + throw new IllegalStateException("Unable to perform activation; no input found"); + } + if (from < 0 || from >= layers.length || from >= to) { + throw new IllegalStateException("Unable to perform activation; FROM is out of layer space"); + } + if (to < 1 || to >= layers.length) { + throw new IllegalStateException("Unable to perform activation; TO is out of layer space"); } + try { + LayerWorkspaceMgr mgr = LayerWorkspaceMgr.noWorkspaces(helperWorkspaces); //TODO - protected void intializeConfigurations() { - if (layerWiseConfigurations == null) - layerWiseConfigurations = new MultiLayerConfiguration.Builder().build(); - - if (layers == null) - layers = new Layer[getnLayers()]; - - if (defaultConfiguration == null) - defaultConfiguration = new NeuralNetConfiguration.Builder().build(); + INDArray res = input; + for (int l = from; l <= to; l++) { + res = this.activationFromPrevLayer(l, res, false, mgr); + } + return res; + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } + /** + * Compute all layer activations, from input to output of the output layer. Note that the input is + * included in the list: thus feedForward(in,train).get(0) is the inputs, .get(1) is the + * activations of layer 0, and so on. + * + * @param train Training: if true, perform forward pass/inference at training time. Usually, + * inference is performed with train = false. This impacts whether dropout etc is + * applied or not. + * @return The list of activations for each layer, including the input + */ + public List feedForward(INDArray input, boolean train) { + setInput(input); + return feedForward(train); + } - /** - * Perform layerwise pretraining for one epoch - see {@link #pretrain(DataSetIterator, int)} - */ - public void pretrain(DataSetIterator iter) { - pretrain(iter, 1); + /** + * Compute activations from input to output of the output layer. As per + * {@link #feedForward(INDArray, boolean)} but using the inputs that have previously been set + * using {@link #setInput(INDArray)} + * + * @return the list of activations for each layer + */ + public List feedForward(boolean train) { + try { + return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layers.length - 1, + input, mask, null, true); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } - /** - * Perform layerwise unsupervised training on all pre-trainable layers in the network (VAEs, Autoencoders, etc), for the specified - * number of epochs each. For example, if numEpochs=3, then layer 0 will be fit for 3 epochs, followed by layer 1 - * for 3 epochs, and so on.
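A minimal sketch of activateSelectedLayers(...) for the 10-layer autoencoder example in the javadoc above, assuming an initialized MultiLayerNetwork net and an INDArray code shaped like the input of layer 5:

    // Run only the decoder half (layers 5..9, both inclusive) on an encoded representation.
    INDArray decoded = net.activateSelectedLayers(5, 9, code);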
- * Note that pretraining will be performed on one layer after the other. To perform unsupervised training on a single layer, - * use {@link #pretrainLayer(int, DataSetIterator)} - * - * @param iter Training data - */ - public void pretrain(DataSetIterator iter, int numEpochs){ - if (flattenedGradients == null) { - initGradientsView(); - } - - for (int i = 0; i < getnLayers(); i++) { - pretrainLayer(i, iter, numEpochs); - } + /** + * Perform feed-forward, optionally (not) clearing the layer input arrays.
Note: when using + * clearInputs=false, there can be some performance and memory overhead: this is because the + * arrays are defined outside of workspaces (which are enabled by default) - otherwise, + * old/invalidated arrays could still be accessed after calling this method. Consequently: Don't + * use clearInputs=false unless you have a use case that requires them to remain after + * feed-forward has been completed + * + * @param train training mode (true) or test mode (false) + * @param clearInputs If false: don't clear the layer inputs + * @return Activations from feed-forward + */ + public List feedForward(boolean train, boolean clearInputs) { + try { + return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layers.length - 1, + input, mask, null, clearInputs); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } - /** - * Fit for one epoch - see {@link #pretrainLayer(int, DataSetIterator, int)} - */ - public void pretrainLayer(int layerIdx, DataSetIterator iter) { - pretrainLayer(layerIdx, iter, 1); + /** + * Compute the activations from the input to the specified layer.
To compute activations for + * all layers, use feedForward(...) methods
Note: output list includes the original input. So + * list.get(0) is always the original input, and list.get(i+1) is the activations of the ith + * layer. + * + * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. + * feedForwardToLayer(i,input) will return the activations for layers 0..i + * (inclusive) + * @param input Input to the network + * @return list of activations. + */ + public List feedForwardToLayer(int layerNum, INDArray input) { + try { + return ffToLayerActivationsDetached(false, FwdPassType.STANDARD, false, layerNum, input, mask, + null, true); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } - /** - * Perform layerwise unsupervised training on a single pre-trainable layer in the network (VAEs, Autoencoders, etc) - * for the specified number of epochs
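A minimal sketch of the feed-forward activation helpers above, assuming an initialized MultiLayerNetwork net and an input INDArray features:

    // Index 0 is the input itself; index i+1 holds the activations of layer i.
    java.util.List<INDArray> acts = net.feedForward(features, false);   // false = inference mode
    INDArray networkOutput = acts.get(acts.size() - 1);
    // Stop early: activations of layers 0..2 only (plus the input at index 0).
    java.util.List<INDArray> firstThree = net.feedForwardToLayer(2, features);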
- * If the specified layer index (0 to numLayers - 1) is not a pretrainable layer, this is a no-op. - * - * @param layerIdx Index of the layer to train (0 to numLayers-1) - * @param iter Training data - * @param numEpochs Number of epochs to fit the specified layer for - */ - public void pretrainLayer(int layerIdx, DataSetIterator iter, int numEpochs) { - Preconditions.checkState(numEpochs > 0, "Number of epochs (%s) must be a positive number", numEpochs); - - if (flattenedGradients == null) { - initGradientsView(); - } - if (layerIdx >= layers.length) { - throw new IllegalArgumentException( - "Cannot pretrain layer: layerIdx (" + layerIdx + ") >= numLayers (" + layers.length + ")"); - } - - Layer layer = layers[layerIdx]; - if (!layer.isPretrainLayer()) - return; - - if(numEpochs > 1 && !iter.resetSupported()) - throw new IllegalStateException("Cannot fit multiple epochs (" + numEpochs + ") on an iterator that doesn't support resetting"); - - if (!iter.hasNext() && iter.resetSupported()) { - iter.reset(); - } - - log.info("Starting unsupervised training on layer " + layerIdx + " for " + numEpochs + " epochs"); - for(int i=0; i 0) - iter.reset(); - - while (iter.hasNext()) { - DataSet next = iter.next(); - input = next.getFeatures(); - pretrainLayer(layerIdx, input); - } - } - - int ec = getLayer(layerIdx).conf().getEpochCount() + 1; - getLayer(layerIdx).conf().setEpochCount(ec); + /** + * Compute the activations from the input to the specified layer.
To compute activations for + * all layers, use feedForward(...) methods
Note: output list includes the original input. So + * list.get(0) is always the original input, and list.get(i+1) is the activations of the ith + * layer. + * + * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. + * feedForwardToLayer(i,input) will return the activations for layers 0..i + * (inclusive) + * @param input Input to the network + * @param train true for training, false for test (i.e., false if using network after + * training) + * @return list of activations. + */ + public List feedForwardToLayer(int layerNum, INDArray input, boolean train) { + try { + int layerVertexIdx = layers[layerNum].getIndex(); + return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layerVertexIdx, input, + mask, null, true); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } - /** - * Perform layerwise unsupervised training on a single pre-trainable layer in the network (VAEs, Autoencoders, etc)
- * If the specified layer index (0 to numLayers - 1) is not a pretrainable layer, this is a no-op. - * - * @param layerIdx Index of the layer to train (0 to numLayers-1) - * @param features Training data array - */ - public void pretrainLayer(int layerIdx, INDArray features) { - setInput(features); - setLayerMaskArrays(null, null); + /** + * Compute the activations from the input to the specified layer, using the currently set input + * for the network.
To compute activations for all layers, use feedForward(...) methods
+ * Note: output list includes the original input. So list.get(0) is always the original input, and + * list.get(i+1) is the activations of the ith layer. + * + * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. + * feedForwardToLayer(i,input) will return the activations for layers 0..i + * (inclusive) + * @param train true for training, false for test (i.e., false if using network after + * training) + * @return list of activations. + */ + public List feedForwardToLayer(int layerNum, boolean train) { + try { + return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layerNum, input, mask, + null, true); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } - if (flattenedGradients == null) { - initGradientsView(); - } - if (layerIdx >= layers.length) { - throw new IllegalArgumentException( - "Cannot pretrain layer: layerIdx (" + layerIdx + ") >= numLayers (" + layers.length + ")"); + protected void validateArrayWorkspaces(LayerWorkspaceMgr mgr, INDArray array, ArrayType arrayType, + int layerIdx, + boolean isPreprocessor, String op) { + try { + mgr.validateArrayLocation(arrayType, array, false, layerIdx > 0); + } catch (ND4JWorkspaceException e) { + String layerName = layers[layerIdx].conf().getLayer().getLayerName(); + String clazz; + if (isPreprocessor) { + clazz = layerWiseConfigurations.getInputPreProcess(layerIdx).getClass().getName(); + } else { + clazz = layers[layerIdx].getClass().getName(); + } + throw new IllegalStateException( + op + ": array (" + arrayType + ") workspace validation failed (" + + (isPreprocessor ? "preprocessor" : "layer ") + layerIdx + (layerName != null ? + " - layer name \"" + + layerName + "\"" : "") + " - class: " + clazz + + ") - array is defined in incorrect workspace", e); + } + } + + /** + * Feed-forward through the network - returning all array activations in a list, detached from any + * workspace. Note that no workspace should be active externally when calling this method (an + * exception will be thrown if a workspace is open externally) + * + * @param train Training mode (true) or test/inference mode (false) + * @param fwdPassType Type of forward pass to perform (STANDARD or + * RNN_ACTIVATE_WITH_STORED_STATE only) + * @param storeLastForTBPTT ONLY used if fwdPassType == + * FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE + * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use + * numLayers-1 + * @param input Input to the network + * @param fMask Feature mask array. May be null. + * @param lMask Label mask array. May be null. + * @param clearInputs Whether the layer inputs should be cleared + * @return List of activations (including the input), detached from any workspace + */ + protected synchronized List ffToLayerActivationsDetached(boolean train, + @NonNull FwdPassType fwdPassType, + boolean storeLastForTBPTT, int layerIndex, @NonNull INDArray input, + INDArray fMask, INDArray lMask, boolean clearInputs) { + setInput(input); + setLayerMaskArrays(fMask, lMask); + + //Verify that no workspace is open externally + WorkspaceUtils.assertNoWorkspacesOpen( + "Expected no workspace active in ffToLayerActivationsDetached"); + + LayerWorkspaceMgr workspaceMgr; + WorkspaceMode wsm = (train ? 
layerWiseConfigurations.getTrainingWorkspaceMode() + : layerWiseConfigurations.getInferenceWorkspaceMode()); + if (wsm == WorkspaceMode.NONE) { + workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + workspaceMgr = LayerWorkspaceMgr.builder() + .noWorkspaceFor(ArrayType.ACTIVATIONS) + .with(ArrayType.INPUT, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + + if (input.isAttached()) { + //Don't leverage out of async DataSetIterator workspaces + workspaceMgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); + } + + if (!clearInputs) { + workspaceMgr.setScopedOutFor(ArrayType.INPUT); + } + } + workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); + + List out = new ArrayList<>(); + out.add(workspaceMgr.leverageTo(ArrayType.INPUT, + input)); //Should be unnecessary (and no op), if layer is implemented correctly + + for (int i = 0; i <= layerIndex; i++) { + try (MemoryWorkspace wsFFWorking = workspaceMgr.notifyScopeEntered( + ArrayType.FF_WORKING_MEM)) { + if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { + input = getLayerWiseConfigurations().getInputPreProcess(i) + .preProcess(input, getInputMiniBatchSize(), workspaceMgr); + //Validation: Exception if invalid (bad preprocessor implementation) + validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, true, + "Feed forward to layer (inference)"); } - LayerWorkspaceMgr workspaceMgr; - if(layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + if (fwdPassType == FwdPassType.STANDARD) { + input = layers[i].activate(input, train, workspaceMgr); + } else if (fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE) { + if (layers[i] instanceof RecurrentLayer) { + input = ((RecurrentLayer) layers[i]).rnnActivateUsingStoredState(input, train, + storeLastForTBPTT, workspaceMgr); + } else if (layers[i] instanceof BaseWrapperLayer + && ((BaseWrapperLayer) layers[i]).getUnderlying() instanceof RecurrentLayer) { + RecurrentLayer rl = (RecurrentLayer) ((BaseWrapperLayer) layers[i]).getUnderlying(); + input = rl.rnnActivateUsingStoredState(input, train, storeLastForTBPTT, workspaceMgr); + } else if (layers[i] instanceof MultiLayerNetwork) { + List temp = ((MultiLayerNetwork) layers[i]).rnnActivateUsingStoredState(input, + train, storeLastForTBPTT); + input = temp.get(temp.size() - 1); + } else { + input = layers[i].activate(input, train, workspaceMgr); + } } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .defaultWorkspace(WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); + throw new IllegalStateException( + "Forward pass type not supported for this method: " + fwdPassType); } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - Layer layer = layers[layerIdx]; - if (!layer.isPretrainLayer()) - return; + //Validation: Exception if invalid (bad layer implementation) + validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, false, + "Feed forward to layer (inference)"); - //Do forward pass to the layer to be pretrained - INDArray outputOfPrevLayer; - if(layerIdx == 0) { - outputOfPrevLayer = input; + out.add(input); + } + if (clearInputs) { + layers[i].clear(); + } + } + + return out; + } + 
+ /** + * Feed-forward through the network at training time - returning a list of all activations in a + * workspace (WS_ALL_LAYERS_ACT) if workspaces are enabled for training; or detached if no + * workspaces are used.
Note: if using workspaces for training, this method requires that + * WS_ALL_LAYERS_ACT is open externally.
If using NO workspaces, requires that no external + * workspace is open
Note that this method does NOT clear the inputs to each layer - instead, + * they are in the WS_ALL_LAYERS_ACT workspace for use in later backprop. + * + * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use + * numLayers-1 + * @param fwdPassType Type of forward pass to perform (STANDARD or + * RNN_ACTIVATE_WITH_STORED_STATE only) + * @param storeLastForTBPTT ONLY used if fwdPassType == + * FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE + * @param input Input to network + * @param fMask Feature mask array. May be null + * @param lMask Label mask aray. May be null. + * @return + */ + protected synchronized List ffToLayerActivationsInWs(int layerIndex, + @NonNull FwdPassType fwdPassType, boolean storeLastForTBPTT, + @NonNull INDArray input, INDArray fMask, INDArray lMask) { + setInput(input); + setLayerMaskArrays(fMask, lMask); + + LayerWorkspaceMgr workspaceMgr; + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + WorkspaceUtils.assertNoWorkspacesOpen( + "Expected no workspace active in ffToLayerActivationsInWs when training workspace is set to NONE"); + workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + workspaceMgr = LayerWorkspaceMgr.builder() + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + + if (input.isAttached()) { + //Don't leverage out of async DataSetIterator workspaces + workspaceMgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); + } + + if (layerWiseConfigurations.getCacheMode() != CacheMode.NONE) { + //For now: store cache mode activations in activations workspace + workspaceMgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); + workspaceMgr.setWorkspace(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, + WS_LAYER_WORKING_MEM_CONFIG); + } + + WorkspaceUtils.assertOpenAndActive(WS_ALL_LAYERS_ACT, + "ffToLayerActivationsInWs method requires workspace WS_ALL_LAYERS_ACT to be open"); + } + workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); + + List out = new ArrayList<>(); + out.add(workspaceMgr.leverageTo(ArrayType.INPUT, input)); //Probably unnecessary usually + + boolean traceLog = log.isTraceEnabled(); + + for (int i = 0; i <= layerIndex; i++) { + try (MemoryWorkspace wsFFWorking = workspaceMgr.notifyScopeEntered( + ArrayType.FF_WORKING_MEM)) { + if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { + input = getLayerWiseConfigurations().getInputPreProcess(i) + .preProcess(input, getInputMiniBatchSize(), workspaceMgr); + //Validation: Exception if invalid (bad preprocessor implementation) + validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, true, + "Feed forward to layer (training)"); + } + + if (traceLog) { + log.trace("About to forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); + } + + if (fwdPassType == FwdPassType.STANDARD) { + input = layers[i].activate(input, true, workspaceMgr); + } else if (fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE) { + if (layers[i] instanceof RecurrentLayer) { + input = ((RecurrentLayer) layers[i]).rnnActivateUsingStoredState(input, true, + storeLastForTBPTT, workspaceMgr); + } else if (layers[i] instanceof BaseWrapperLayer + && ((BaseWrapperLayer) layers[i]).getUnderlying() 
instanceof RecurrentLayer) { + RecurrentLayer rl = (RecurrentLayer) ((BaseWrapperLayer) layers[i]).getUnderlying(); + input = rl.rnnActivateUsingStoredState(input, true, storeLastForTBPTT, workspaceMgr); + } else if (layers[i] instanceof MultiLayerNetwork) { + List temp = ((MultiLayerNetwork) layers[i]).rnnActivateUsingStoredState(input, + true, storeLastForTBPTT); + input = temp.get(temp.size() - 1); + } else { + input = layers[i].activate(input, true, workspaceMgr); + } } else { - //Yes, this part of training - but we'll do forward psas as inference mode when doing layerwise training - // to effectively freeze earlier layers and not apply dropout etc - outputOfPrevLayer = outputOfLayerDetached(false, FwdPassType.STANDARD, layerIndex-1, features, null, null, null); + throw new IllegalStateException( + "FwdPassType not supported for this method: " + fwdPassType); } - try(MemoryWorkspace ws = workspaceMgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { - if (layerWiseConfigurations.getInputPreProcess(layerIdx) != null) { - - if (input.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - outputOfPrevLayer = layerWiseConfigurations.getInputPreProcess(layerIdx).preProcess(outputOfPrevLayer, (int) input.size(0), - LayerWorkspaceMgr.noWorkspaces(helperWorkspaces)); - } - - layer.fit(outputOfPrevLayer, workspaceMgr); - } - } - - @Override - public int batchSize() { - //In 99+% of cases, the input and labels dimension 0 size should be identical - //The only real exceptions: space to batch, and batch to space layers - //In those cases, we should base it on the labels size, as this impacts gradient calculation - if (input.size(0) > Integer.MAX_VALUE || labels.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - return labels == null ? (int) input.size(0) : (int)labels.size(0); - } - - @Override - public NeuralNetConfiguration conf() { - return defaultConfiguration; - } - - @Override - public void setConf(NeuralNetConfiguration conf) { - throw new UnsupportedOperationException(); - } - - @Override - public INDArray input() { - return input; - } - - @Override - public ConvexOptimizer getOptimizer() { - return solver.getOptimizer(); - } - - /** - * Get one parameter array for the network.
- * In MultiLayerNetwork, parameters are keyed like "0_W" and "0_b" to mean "weights of layer index 0" and "biases - * of layer index 0" respectively. Numbers increment sequentially, and the suffixes ("W", "b" etc) depend on the - * layer type, and are defined in the relevant parameter initializers for each layer.
- * Note that the returned INDArrays are views of the underlying network parameters, so modifications of the returned - * arrays will impact the parameters of the network. - * - * @param param the key of the parameter - * @return The specified parameter array for the network - * @see #paramTable() paramTable() method, for a map of all parameters - */ - @Override - public INDArray getParam(String param) { - //Get params for MultiLayerNetwork sub layers. - int idx = param.indexOf('_'); - if (idx == -1) - throw new IllegalStateException("Invalid param key: does not have layer separator: \"" + param + "\""); - int layerIdx = Integer.parseInt(param.substring(0, idx)); - String newKey = param.substring(idx + 1); - - return layers[layerIdx].getParam(newKey); - } - - /** - * Return a map of all parameters in the network. Parameter names are as described in {@link #getParam(String)}. - * As per {@link #getParam(String)} the returned arrays are views - modifications to these will impact - * the underlying network parameters - * @return A map of all parameters in the network - */ - @Override - public Map paramTable() { - return paramTable(false); - } - - /** - * Returns a map of all parameters in the network as per {@link #paramTable()}.
- * Optionally (with backpropParamsOnly=true) only the 'backprop' parameters are returned - that is, any parameters - * involved only in unsupervised layerwise pretraining not standard inference/backprop are excluded from the returned list. - * @param backpropParamsOnly If true, return backprop params only. If false: return all params - * @return Parameters for the network - */ - public Map paramTable(boolean backpropParamsOnly) { - //Get all parameters from all layers - Map allParams = new LinkedHashMap<>(); - for (int i = 0; i < layers.length; i++) { - Map paramMap = layers[i].paramTable(backpropParamsOnly); - for (Map.Entry entry : paramMap.entrySet()) { - String newKey = i + "_" + entry.getKey(); - allParams.put(newKey, entry.getValue()); - } - } - return allParams; - } - - /** - * Intended for internal use - */ - @Override - public boolean updaterDivideByMinibatch(String paramName) { - int idx = paramName.indexOf('_'); - int layerIdx = Integer.parseInt(paramName.substring(0, idx)); - String subName = paramName.substring(idx+1); - return getLayer(layerIdx).updaterDivideByMinibatch(subName); - } - - /** - * Set the parameters of the netowrk. Note that the parameter keys must match the format as described in {@link #getParam(String)} - * and {@link #paramTable()}. Note that the values of the parameters used as an argument to this method are copied - - * i.e., it is safe to later modify/reuse the values in the provided paramTable without this impacting the network. - * - * @param paramTable Parameters to set - */ - @Override - public void setParamTable(Map paramTable) { - Map currParamTable = paramTable(); - if (!currParamTable.keySet().equals(paramTable.keySet())) { - throw new IllegalArgumentException("Cannot set param table: parameter keys do not match.\n" + "Current: " - + currParamTable.keySet() + "\nTo set: " + paramTable.keySet()); + if (input == null) { + throw new IllegalStateException("Layer " + i + " returned null activations"); } - for (String s : paramTable.keySet()) { - INDArray curr = currParamTable.get(s); - INDArray toSet = paramTable.get(s); - if (!Arrays.equals(curr.shape(), toSet.shape())) { - throw new IllegalArgumentException("Cannot set parameter table: parameter \"" + s + "\" shapes " - + "do not match. Current = " + Arrays.toString(curr.shape()) + ", to set = " - + Arrays.toString(toSet.shape())); - } - } + //Validation: Exception if invalid (bad layer implementation) + validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, false, + "Feed forward to layer (training)"); + validateArrayWorkspaces(workspaceMgr, layers[i].input(), ArrayType.INPUT, i, false, + "Feed forward to layer (training)"); - //Now that we've checked ALL params (to avoid leaving net in half-modified state) - for (String s : paramTable.keySet()) { - INDArray curr = currParamTable.get(s); - INDArray toSet = paramTable.get(s); - curr.assign(toSet); + out.add(input); + + if (traceLog) { + log.trace("Completed forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); } + } } - /** - * Set the values of a single parameter. See {@link #setParamTable(Map)} and {@link #getParam(String)} for more - * details. - * @param key the key of the parameter to set - * @param val the new values for the parameter - */ - @Override - public void setParam(String key, INDArray val) { - //Set params for MultiLayerNetwork sub layers. 
- int idx = key.indexOf('_'); - if (idx == -1) - throw new IllegalStateException("Invalid param key: not have layer separator: \"" + key + "\""); - int layerIdx = Integer.parseInt(key.substring(0, idx)); - String newKey = key.substring(idx + 1); - - layers[layerIdx].setParam(newKey, val); - } - - /** - * Get the configuration for the network - * @return Network configuration - */ - public MultiLayerConfiguration getLayerWiseConfigurations() { - return layerWiseConfigurations; - } - - /** - * This method is intended for internal/developer use only. - */ - public void setLayerWiseConfigurations(MultiLayerConfiguration layerWiseConfigurations) { - this.layerWiseConfigurations = layerWiseConfigurations; - } - - /** - * Initialize the MultiLayerNetwork. This should be called once before the network is used. - * This is functionally equivalent to calling {@code init(null, false)}. - * @see MultiLayerNetwork#init(INDArray, boolean) - */ - public void init() { - init(null, false); - } - - /** - * Initialize the MultiLayerNetwork, optionally with an existing parameters array. - * If an existing parameters array is specified, it will be used (and the values will not be modified) in the network; - * if no parameters array is specified, parameters will be initialized randomly according to the network configuration. - * - * @param parameters Network parameter. May be null. If null: randomly initialize. - * @param cloneParametersArray Whether the parameter array (if any) should be cloned, or used directly - */ - public void init(INDArray parameters, boolean cloneParametersArray) { - if (layerWiseConfigurations == null || layers == null) - intializeConfigurations(); - if (initCalled) - return; - - DataType netDtype = getLayerWiseConfigurations().getDataType(); - if(parameters != null && parameters.dataType() != netDtype){ - Preconditions.checkState(parameters.rank() == 2 && parameters.size(0) == 1, "Invalid parameters array: should be rank 2 with shape [1,numParams]. Got %ndShape", parameters); - if(cloneParametersArray){ - try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { - parameters = parameters.castTo(netDtype); - } - } else { - throw new IllegalStateException("Error initializing network: Network datatype is set to " + netDtype - + " but provided array has datatype " + parameters.dataType() + " with cloneParametersArray argument" + - " set to false. 
Cannot initialize net with specified datatype array if that array does not match network datatype"); - } - } - - - if (layerMap == null) - layerMap = new LinkedHashMap<>(); - - if (layerWiseConfigurations.getTrainingWorkspaceMode() == null) - layerWiseConfigurations.setTrainingWorkspaceMode(WorkspaceMode.NONE); - - if (layerWiseConfigurations.getInferenceWorkspaceMode() == null) - layerWiseConfigurations.setInferenceWorkspaceMode(WorkspaceMode.NONE); - - if (layerWiseConfigurations.getCacheMode() == null) - layerWiseConfigurations.setCacheMode(CacheMode.NONE); - - OneTimeLogger.info(log, "Starting MultiLayerNetwork with WorkspaceModes set to [training: {}; inference: {}], cacheMode set to [{}]", - layerWiseConfigurations.getTrainingWorkspaceMode(), - layerWiseConfigurations.getInferenceWorkspaceMode(), - layerWiseConfigurations.getCacheMode()); - - int nLayers = getnLayers(); - - if (nLayers < 1) - throw new IllegalStateException("Unable to create network: number of layers is less than 1"); - - if (this.layers == null || this.layers[0] == null) { - if (this.layers == null) - this.layers = new Layer[nLayers]; - - //First: Work out total length of params - long paramLength = 0; - val nParamsPerLayer = new long[nLayers]; - for (int i = 0; i < nLayers; i++) { - NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); - conf.getLayer().setDataType(netDtype); - nParamsPerLayer[i] = conf.getLayer().initializer().numParams(conf); - paramLength += nParamsPerLayer[i]; - } - - //Create parameters array, if required - boolean initializeParams; - if (parameters != null) { - if (!parameters.isRowVectorOrScalar()) - throw new IllegalArgumentException("Invalid parameters: should be a row vector"); - if (parameters.length() != paramLength) - throw new IllegalArgumentException("Invalid parameters: expected length " + paramLength - + ", got length " + parameters.length()); - - if (cloneParametersArray) - flattenedParams = parameters.dup(); - else - flattenedParams = parameters; - - initializeParams = false; - } else if(paramLength > 0){ - flattenedParams = Nd4j.create(netDtype, 1, paramLength); - initializeParams = true; - } else { - //Edge case: 0 params in network - flattenedParams = null; - initializeParams = false; - } - - //Set RNG seed, for repeatability between initializations when set - if (initializeParams) { - Nd4j.getRandom().setSeed(getDefaultConfiguration().getSeed()); - } - - // construct multi-layer - long paramCountSoFar = 0; - for (int i = 0; i < nLayers; i++) { - INDArray paramsView; - if (nParamsPerLayer[i] > 0) { - paramsView = flattenedParams.get(NDArrayIndex.interval(0,0,true), - NDArrayIndex.interval(paramCountSoFar, paramCountSoFar + nParamsPerLayer[i])); - } else { - paramsView = null; - } - paramCountSoFar += nParamsPerLayer[i]; - - NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); - layers[i] = conf.getLayer().instantiate(conf, trainingListeners, i, paramsView, initializeParams, netDtype); - layerMap.put(conf.getLayer().getLayerName(), layers[i]); - } - initCalled = true; - } - - //Set parameters in MultiLayerNetwork.defaultConfiguration for later use in BaseOptimizer.setupSearchState() etc - defaultConfiguration.clearVariables(); - List variables = defaultConfiguration.variables(false); - for (int i = 0; i < layers.length; i++) { - if(layers[i] == null){ - throw new IllegalStateException("Encountered null layer during initialization for layer " + i + - ": " + layerWiseConfigurations.getConf(i).getLayer().getClass().getSimpleName() + " initialization " + - 
"returned null layer?"); - } - - for (String s : layers[i].conf().variables()) { - variables.add(i + "_" + s); - } - } - - // now we init solver & optimizer - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this).build(); - solver.initOptimizer(); - } - } - - //Mark that input modification is allowed. - //TODO When is it safe to NOT skip the very first layer? It's not always safe... - // For example dropout + iterating over List that is used for multiple epochs... - for( int i=1; i - *
- * PLEASE NOTE: Do not use this method unless you understand how to use GradientsAccumulator & updates sharing.
- * PLEASE NOTE: Do not use this method on standalone model - * - * @param accumulator Gradient accumulator to use for the network - */ - public void setGradientsAccumulator(GradientsAccumulator accumulator) { - if (!isInitCalled()) - init(); - - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) - .build(); - } - } - - solver.getOptimizer().setGradientsAccumulator(accumulator); - } - - public boolean isInitCalled() { - return initCalled; - } - - /** - * This method: initializes the flattened gradients array (used in backprop) and sets the appropriate subset in all layers. - * As a general rule, this shouldn't ever need to be called manually when doing training via fit(DataSet) or fit(DataSetIterator) - */ - public void initGradientsView() { - try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - if (layers == null) - init(); - - int nLayers = layers.length; - - //First: Work out total length of params - long paramLength = 0; - val nParamsPerLayer = new long[nLayers]; - for (int i = 0; i < nLayers; i++) { - NeuralNetConfiguration conf = layerWiseConfigurations.getConf(i); - nParamsPerLayer[i] = conf.getLayer().initializer().numParams(conf); - paramLength += nParamsPerLayer[i]; - } - - if(paramLength > 0) { - flattenedGradients = Nd4j.create(flattenedParams.dataType(), new long[]{1, paramLength}, 'f'); //No need to initialize, as each layer will do it each iteration anyway - } - - long paramsSoFar = 0; - for (int i = 0; i < layers.length; i++) { - if (nParamsPerLayer[i] == 0) - continue; //This layer doesn't have any parameters... - INDArray thisLayerGradView = flattenedGradients.get(NDArrayIndex.interval(0,0,true), - NDArrayIndex.interval(paramsSoFar, paramsSoFar + nParamsPerLayer[i])); - layers[i].setBackpropGradientsViewArray(thisLayerGradView); - paramsSoFar += nParamsPerLayer[i]; - } - } - } - - protected INDArray activationFromPrevLayer(int curr, INDArray input, boolean training, LayerWorkspaceMgr mgr) { - if (getLayerWiseConfigurations().getInputPreProcess(curr) != null) { - input = getLayerWiseConfigurations().getInputPreProcess(curr).preProcess(input, getInputMiniBatchSize(), mgr); - } - - INDArray ret = layers[curr].activate(input, training, mgr); - return ret; - } - - /** - * Calculate activation for few layers at once. Suitable for autoencoder partial activation. - * - * In example: in 10-layer deep autoencoder, layers 0 - 4 inclusive are used for encoding part, and layers 5-9 inclusive are used for decoding part. 
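A minimal usage sketch of the partial activation described above, assuming a hypothetical, already-initialized 10-layer autoencoder `net` (a MultiLayerNetwork) and an input matrix `features`; both names are placeholders:

    // Sketch only: 'net' (10-layer autoencoder) and 'features' are assumed to exist
    INDArray encoded = net.activateSelectedLayers(0, 4, features);       // encoder half: layers 0..4 inclusive
    INDArray reconstructed = net.activateSelectedLayers(5, 9, encoded);  // decoder half: layers 5..9 inclusive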
- * - * @param from first layer to be activated, inclusive - * @param to last layer to be activated, inclusive - * @return the activation from the last layer - */ - public INDArray activateSelectedLayers(int from, int to, INDArray input) { - if (input == null) - throw new IllegalStateException("Unable to perform activation; no input found"); - if (from < 0 || from >= layers.length || from >= to) - throw new IllegalStateException("Unable to perform activation; FROM is out of layer space"); - if (to < 1 || to >= layers.length) - throw new IllegalStateException("Unable to perform activation; TO is out of layer space"); - - try { - LayerWorkspaceMgr mgr = LayerWorkspaceMgr.noWorkspaces(helperWorkspaces); //TODO - - INDArray res = input; - for (int l = from; l <= to; l++) { - res = this.activationFromPrevLayer(l, res, false, mgr); - } - return res; - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** - * Compute all layer activations, from input to output of the output layer. - * Note that the input is included in the list: thus feedForward(in,train).get(0) is the inputs, - * .get(1) is the activations of layer 0, and so on. - * - * @param train Training: if true, perform forward pass/inference at training time. Usually, inference is performed - * with train = false. This impacts whether dropout etc is applied or not. - * @return The list of activations for each layer, including the input - */ - public List feedForward(INDArray input, boolean train) { - setInput(input); - return feedForward(train); - } - - /** - * Compute activations from input to output of the output layer. - * As per {@link #feedForward(INDArray, boolean)} but using the inputs that have previously been set using {@link #setInput(INDArray)} - * - * @return the list of activations for each layer - */ - public List feedForward(boolean train) { - try { - return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layers.length-1, - input, mask, null, true); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** - * Perform feed-forward, optionally (not) clearing the layer input arrays.
- * Note: when using clearInputs=false, there can be some performance and memory overhead: this is because the arrays are - * defined outside of workspaces (which are enabled by default) - otherwise, old/invalidated arrays could still be - * accessed after calling this method. Consequently: Don't use clearInputs=false unless you have a use case that - * requires them to remain after feed-forward has been completed - * - * @param train training mode (true) or test mode (false) - * @param clearInputs If false: don't clear the layer inputs - * @return Activations from feed-forward - */ - public List feedForward(boolean train, boolean clearInputs){ - try{ - return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layers.length-1, input, mask, null, clearInputs); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** Compute the activations from the input to the specified layer.
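A short sketch of the clearInputs flag discussed above; `net` is assumed to be an already-initialized MultiLayerNetwork and `features` an input batch (placeholder names):

    // Sketch only: 'net' and 'features' are assumed to exist
    net.setInput(features);
    List<INDArray> acts = net.feedForward(false, true);      // train = false, inputs cleared (default, cheapest)
    net.setInput(features);
    List<INDArray> actsKept = net.feedForward(false, false); // clearInputs = false: layer inputs are kept, at some
                                                              // memory cost, since they must live outside workspaces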
- * To compute activations for all layers, use feedForward(...) methods
- * Note: output list includes the original input. So list.get(0) is always the original input, and - * list.get(i+1) is the activations of the ith layer. - * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. - * feedForwardToLayer(i,input) will return the activations for layers 0..i (inclusive) - * @param input Input to the network - * @return list of activations. - */ - public List feedForwardToLayer(int layerNum, INDArray input) { - try{ - return ffToLayerActivationsDetached(false, FwdPassType.STANDARD, false, layerNum, input, mask, null, true); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** Compute the activations from the input to the specified layer.
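A small sketch of the indexing convention above; `net` and `features` are placeholder names for an initialized MultiLayerNetwork and an input batch:

    // Sketch only: 'net' and 'features' are assumed to exist
    List<INDArray> acts = net.feedForwardToLayer(1, features);  // activations for layers 0..1 inclusive
    INDArray originalInput = acts.get(0);  // index 0 is always the original input
    INDArray layer0Out = acts.get(1);      // index i + 1 holds the activations of layer i
    INDArray layer1Out = acts.get(2);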
- * To compute activations for all layers, use feedForward(...) methods
- * Note: output list includes the original input. So list.get(0) is always the original input, and - * list.get(i+1) is the activations of the ith layer. - * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. - * feedForwardToLayer(i,input) will return the activations for layers 0..i (inclusive) - * @param input Input to the network - * @param train true for training, false for test (i.e., false if using network after training) - * @return list of activations. - */ - public List feedForwardToLayer(int layerNum, INDArray input, boolean train) { - try { - int layerVertexIdx = layers[layerNum].getIndex(); - return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layerVertexIdx, input, mask, null, true); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** Compute the activations from the input to the specified layer, using the currently set input for the network.
- * To compute activations for all layers, use feedForward(...) methods
- * Note: output list includes the original input. So list.get(0) is always the original input, and - * list.get(i+1) is the activations of the ith layer. - * @param layerNum Index of the last layer to calculate activations for. Layers are zero-indexed. - * feedForwardToLayer(i,input) will return the activations for layers 0..i (inclusive) - * @param train true for training, false for test (i.e., false if using network after training) - * @return list of activations. - */ - public List feedForwardToLayer(int layerNum, boolean train) { - try { - return ffToLayerActivationsDetached(train, FwdPassType.STANDARD, false, layerNum, input, mask, null, true); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - - protected void validateArrayWorkspaces(LayerWorkspaceMgr mgr, INDArray array, ArrayType arrayType, int layerIdx, - boolean isPreprocessor, String op){ - try{ - mgr.validateArrayLocation(arrayType, array, false, layerIdx > 0); - } catch (ND4JWorkspaceException e){ - String layerName = layers[layerIdx].conf().getLayer().getLayerName(); - String clazz; - if(isPreprocessor){ - clazz = layerWiseConfigurations.getInputPreProcess(layerIdx).getClass().getName(); - } else { - clazz = layers[layerIdx].getClass().getName(); - } - throw new IllegalStateException(op + ": array (" + arrayType + ") workspace validation failed (" + - (isPreprocessor ? "preprocessor" : "layer ") + layerIdx + (layerName != null ? " - layer name \"" + - layerName + "\"" : "") + " - class: " + clazz + ") - array is defined in incorrect workspace", e); - } - } - - /** - * Feed-forward through the network - returning all array activations in a list, detached from any workspace. - * Note that no workspace should be active externally when calling this method (an exception will be thrown - * if a workspace is open externally) - * - * @param train Training mode (true) or test/inference mode (false) - * @param fwdPassType Type of forward pass to perform (STANDARD or RNN_ACTIVATE_WITH_STORED_STATE only) - * @param storeLastForTBPTT ONLY used if fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE - * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use numLayers-1 - * @param input Input to the network - * @param fMask Feature mask array. May be null. - * @param lMask Label mask array. May be null. - * @param clearInputs Whether the layer inputs should be cleared - * @return List of activations (including the input), detached from any workspace - */ - protected synchronized List ffToLayerActivationsDetached(boolean train, @NonNull FwdPassType fwdPassType, - boolean storeLastForTBPTT, int layerIndex, @NonNull INDArray input, - INDArray fMask, INDArray lMask, boolean clearInputs){ - setInput(input); - setLayerMaskArrays(fMask, lMask); - - //Verify that no workspace is open externally - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in ffToLayerActivationsDetached"); - - LayerWorkspaceMgr workspaceMgr; - WorkspaceMode wsm = (train ? 
layerWiseConfigurations.getTrainingWorkspaceMode() : layerWiseConfigurations.getInferenceWorkspaceMode()); - if(wsm == WorkspaceMode.NONE){ - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .noWorkspaceFor(ArrayType.ACTIVATIONS) - .with(ArrayType.INPUT, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); - - if(input.isAttached()){ - //Don't leverage out of async DataSetIterator workspaces - workspaceMgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); - } - - if(!clearInputs){ - workspaceMgr.setScopedOutFor(ArrayType.INPUT); - } - } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - - List out = new ArrayList<>(); - out.add(workspaceMgr.leverageTo(ArrayType.INPUT, input)); //Should be unnecessary (and no op), if layer is implemented correctly - - for( int i=0; i<=layerIndex; i++ ){ - try(MemoryWorkspace wsFFWorking = workspaceMgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)){ - if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { - input = getLayerWiseConfigurations().getInputPreProcess(i).preProcess(input, getInputMiniBatchSize(), workspaceMgr); - //Validation: Exception if invalid (bad preprocessor implementation) - validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, true, "Feed forward to layer (inference)"); - } - - if(fwdPassType == FwdPassType.STANDARD){ - input = layers[i].activate(input, train, workspaceMgr); - } else if (fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE) { - if (layers[i] instanceof RecurrentLayer) { - input = ((RecurrentLayer) layers[i]).rnnActivateUsingStoredState(input, train, - storeLastForTBPTT, workspaceMgr); - } else if(layers[i] instanceof BaseWrapperLayer && ((BaseWrapperLayer)layers[i]).getUnderlying() instanceof RecurrentLayer) { - RecurrentLayer rl = (RecurrentLayer) ((BaseWrapperLayer)layers[i]).getUnderlying(); - input = rl.rnnActivateUsingStoredState(input, train,storeLastForTBPTT, workspaceMgr); - } else if (layers[i] instanceof MultiLayerNetwork) { - List temp = ((MultiLayerNetwork) layers[i]).rnnActivateUsingStoredState(input, train, storeLastForTBPTT); - input = temp.get(temp.size() - 1); - } else { - input = layers[i].activate(input, train, workspaceMgr); - } - } else { - throw new IllegalStateException("Forward pass type not supported for this method: " + fwdPassType); - } - - //Validation: Exception if invalid (bad layer implementation) - validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, false, "Feed forward to layer (inference)"); - - out.add(input); - } - if(clearInputs) { - layers[i].clear(); - } - } - - return out; - } - - /** - * Feed-forward through the network at training time - returning a list of all activations in a workspace (WS_ALL_LAYERS_ACT) - * if workspaces are enabled for training; or detached if no workspaces are used.
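A configuration sketch for the training/inference workspace switch used above, based on the standard NeuralNetConfiguration builder; the layer sizes are arbitrary placeholder values:

    // Sketch only: layer sizes are placeholders
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
        .trainingWorkspaceMode(WorkspaceMode.ENABLED)  // training-time forward pass keeps activations in WS_ALL_LAYERS_ACT
        .inferenceWorkspaceMode(WorkspaceMode.NONE)    // inference runs without workspaces; activations are detached
        .list()
        .layer(new DenseLayer.Builder().nIn(4).nOut(8).activation(Activation.RELU).build())
        .layer(new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
            .nIn(8).nOut(3).activation(Activation.SOFTMAX).build())
        .build();
    MultiLayerNetwork net = new MultiLayerNetwork(conf);
    net.init();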
- * Note: if using workspaces for training, this method requires that WS_ALL_LAYERS_ACT is open externally.
- * If using NO workspaces, requires that no external workspace is open
- * Note that this method does NOT clear the inputs to each layer - instead, they are in the WS_ALL_LAYERS_ACT workspace - * for use in later backprop. - * - * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use numLayers-1 - * @param fwdPassType Type of forward pass to perform (STANDARD or RNN_ACTIVATE_WITH_STORED_STATE only) - * @param storeLastForTBPTT ONLY used if fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE - * @param input Input to network - * @param fMask Feature mask array. May be null - * @param lMask Label mask aray. May be null. - * @return - */ - protected synchronized List ffToLayerActivationsInWs(int layerIndex, @NonNull FwdPassType fwdPassType, boolean storeLastForTBPTT, - @NonNull INDArray input, INDArray fMask, INDArray lMask){ - setInput(input); - setLayerMaskArrays(fMask, lMask); - - LayerWorkspaceMgr workspaceMgr; - if(layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in ffToLayerActivationsInWs when training workspace is set to NONE"); - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); - - if(input.isAttached()){ - //Don't leverage out of async DataSetIterator workspaces - workspaceMgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); - } - - if(layerWiseConfigurations.getCacheMode() != CacheMode.NONE){ - //For now: store cache mode activations in activations workspace - workspaceMgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); - workspaceMgr.setWorkspace(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG); - } - - WorkspaceUtils.assertOpenAndActive(WS_ALL_LAYERS_ACT, "ffToLayerActivationsInWs method requires workspace WS_ALL_LAYERS_ACT to be open"); - } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - - List out = new ArrayList<>(); - out.add(workspaceMgr.leverageTo(ArrayType.INPUT, input)); //Probably unnecessary usually - - boolean traceLog = log.isTraceEnabled(); - - for( int i = 0; i <=layerIndex; i++) { - try(MemoryWorkspace wsFFWorking = workspaceMgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)){ - if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { - input = getLayerWiseConfigurations().getInputPreProcess(i).preProcess(input, getInputMiniBatchSize(), workspaceMgr); - //Validation: Exception if invalid (bad preprocessor implementation) - validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, true, "Feed forward to layer (training)"); - } - - if(traceLog){ - log.trace("About to forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); - } - - if(fwdPassType == FwdPassType.STANDARD){ - input = layers[i].activate(input, true, workspaceMgr); - } else if(fwdPassType == FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE){ - if (layers[i] instanceof RecurrentLayer) { - input = ((RecurrentLayer) layers[i]).rnnActivateUsingStoredState(input, true, storeLastForTBPTT, workspaceMgr); - }else if(layers[i] instanceof BaseWrapperLayer && ((BaseWrapperLayer)layers[i]).getUnderlying() instanceof RecurrentLayer) { - RecurrentLayer rl = 
(RecurrentLayer) ((BaseWrapperLayer)layers[i]).getUnderlying(); - input = rl.rnnActivateUsingStoredState(input, true, storeLastForTBPTT, workspaceMgr); - } else if (layers[i] instanceof MultiLayerNetwork) { - List temp = ((MultiLayerNetwork) layers[i]).rnnActivateUsingStoredState(input, true, storeLastForTBPTT); - input = temp.get(temp.size() - 1); - } else { - input = layers[i].activate(input, true, workspaceMgr); - } - } else { - throw new IllegalStateException("FwdPassType not supported for this method: " + fwdPassType); - } - - if(input == null){ - throw new IllegalStateException("Layer " + i + " returned null activations"); - } - - //Validation: Exception if invalid (bad layer implementation) - validateArrayWorkspaces(workspaceMgr, input, ArrayType.ACTIVATIONS, i, false, "Feed forward to layer (training)"); - validateArrayWorkspaces(workspaceMgr, layers[i].input(), ArrayType.INPUT, i, false, "Feed forward to layer (training)"); - - out.add(input); - - if(traceLog){ - log.trace("Completed forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); - } - } - } - - return out; - } - - /** - * Provide the output of the specified layer, detached from any workspace. This is most commonly used at inference/test - * time, and is more memory efficient than {@link #ffToLayerActivationsDetached(boolean, FwdPassType, boolean, int, INDArray, INDArray, INDArray, boolean)} - * and {@link #ffToLayerActivationsInWs(int, FwdPassType, boolean, INDArray, INDArray, INDArray)}.
- * This method clears all layer inputs. - * - * NOTE: in general, no workspaces should be activated externally for this method! - * This method handles the workspace activation as required - * - * @param train Training mode (true) or test/inference mode (false) - * @param fwdPassType Type of forward pass to perform (STANDARD, RNN_TIMESTEP or RNN_ACTIVATE_WITH_STORED_STATE) - * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use numLayers-1 - * @param input Input to the network - * @param featureMask Input/feature mask array. May be null. - * @param labelsMask Labels mask array. May be null - * @param outputWorkspace Optional - if provided, outputs should be placed in this workspace. NOTE: this workspace - * must be open - * @return Output of the specified layer, detached from any workspace - */ - protected INDArray outputOfLayerDetached(boolean train, @NonNull FwdPassType fwdPassType, int layerIndex, @NonNull INDArray input, - INDArray featureMask, INDArray labelsMask, MemoryWorkspace outputWorkspace){ - setInput(input); - setLayerMaskArrays(featureMask, labelsMask); + return out; + } + + /** + * Provide the output of the specified layer, detached from any workspace. This is most commonly + * used at inference/test time, and is more memory efficient than + * {@link #ffToLayerActivationsDetached(boolean, FwdPassType, boolean, int, INDArray, INDArray, + * INDArray, boolean)} and + * {@link #ffToLayerActivationsInWs(int, FwdPassType, boolean, INDArray, INDArray, INDArray)}.
+ * This method clears all layer inputs. + *
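At the usage level, the public output(...) overloads typically reach this single-array path, whereas feedForward(...) materializes one activation array per layer; a sketch, with `net` and `features` as placeholder names for an initialized MultiLayerNetwork and an input batch:

    // Sketch only: 'net' and 'features' are assumed to exist
    INDArray prediction = net.output(features);                       // inference: only the final layer's output, detached
    List<INDArray> allActivations = net.feedForward(features, false); // one array per layer, plus the input at index 0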

+ * NOTE: in general, no workspaces should be activated externally for this method! This method + * handles the workspace activation as required + * + * @param train Training mode (true) or test/inference mode (false) + * @param fwdPassType Type of forward pass to perform (STANDARD, RNN_TIMESTEP or + * RNN_ACTIVATE_WITH_STORED_STATE) + * @param layerIndex Index (inclusive) to stop forward pass at. For all layers, use + * numLayers-1 + * @param input Input to the network + * @param featureMask Input/feature mask array. May be null. + * @param labelsMask Labels mask array. May be null + * @param outputWorkspace Optional - if provided, outputs should be placed in this workspace. + * NOTE: this workspace must be open + * @return Output of the specified layer, detached from any workspace + */ + protected INDArray outputOfLayerDetached(boolean train, @NonNull FwdPassType fwdPassType, + int layerIndex, @NonNull INDArray input, + INDArray featureMask, INDArray labelsMask, MemoryWorkspace outputWorkspace) { + setInput(input); + setLayerMaskArrays(featureMask, labelsMask); /* Idea here: we want to minimize memory, and return only the final array @@ -1203,672 +1341,731 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, org.d Additionally, we'll reconfigure the workspace manager for the *final* layer, so that we don't have to detach */ - if(outputWorkspace == null || outputWorkspace instanceof DummyWorkspace) { - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in outputOfLayerDetached", true); - } else { - Preconditions.checkState(outputWorkspace.isScopeActive(), "Workspace \"" + outputWorkspace.getId() + - "\" was provided for the network/layer outputs. When provided, this workspace must be opened before " + - "calling the output method; furthermore, closing the workspace is the responsibility of the user"); + if (outputWorkspace == null || outputWorkspace instanceof DummyWorkspace) { + WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in outputOfLayerDetached", + true); + } else { + Preconditions.checkState(outputWorkspace.isScopeActive(), + "Workspace \"" + outputWorkspace.getId() + + "\" was provided for the network/layer outputs. When provided, this workspace must be opened before " + + + "calling the output method; furthermore, closing the workspace is the responsibility of the user"); + } + + LayerWorkspaceMgr mgrEven; + LayerWorkspaceMgr mgrOdd; + + WorkspaceMode wsm = train ? layerWiseConfigurations.getTrainingWorkspaceMode() + : layerWiseConfigurations.getInferenceWorkspaceMode(); + if (wsm == WorkspaceMode.NONE) { + mgrEven = LayerWorkspaceMgr.noWorkspaces(); + mgrOdd = mgrEven; + + //Check for external workspace - doesn't make sense to have one with workspace mode NONE + if (outputWorkspace != null && !(outputWorkspace instanceof DummyWorkspace)) { + throw new IllegalStateException("Workspace \"" + outputWorkspace.getId() + + "\" was provided for the network/layer outputs, however " + (train ? "training" + : "inference") + + " workspace mode is set to NONE. Cannot put output activations into the specified workspace if" + + + "workspaces are disabled for the network. 
use getConfiguration().setTraining/InferenceWorkspaceMode(WorkspaceMode.ENABLED)"); + } + } else { + mgrEven = LayerWorkspaceMgr.builder() + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_LAYER_ACT_1, WS_LAYER_ACT_X_CONFIG) + .with(ArrayType.INPUT, WS_LAYER_ACT_2, + WS_LAYER_ACT_X_CONFIG) //Inputs should always be in the previous WS + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + + mgrOdd = LayerWorkspaceMgr.builder() + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG) + .with(ArrayType.INPUT, WS_LAYER_ACT_1, + WS_LAYER_ACT_X_CONFIG) //Inputs should always be in the previous WS + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + } + mgrEven.setHelperWorkspacePointers(helperWorkspaces); + mgrOdd.setHelperWorkspacePointers(helperWorkspaces); + + MemoryWorkspace wsActCloseNext = null; + MemoryWorkspace temp = null; + MemoryWorkspace initialWorkspace = Nd4j.getMemoryManager().getCurrentWorkspace(); + + boolean traceLog = log.isTraceEnabled(); + + Throwable t = null; + try { + for (int i = 0; i <= layerIndex; i++) { + LayerWorkspaceMgr mgr = (i % 2 == 0 ? mgrEven : mgrOdd); + + if (traceLog) { + log.trace("About to forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); } - LayerWorkspaceMgr mgrEven; - LayerWorkspaceMgr mgrOdd; + //Edge case: for first layer with dropout, inputs can't be in previous workspace (as it hasn't been opened yet) + //Hence: put inputs in working memory + if (i == 0 && wsm != WorkspaceMode.NONE) { + mgr.setWorkspace(ArrayType.INPUT, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG); + } - WorkspaceMode wsm = train ? layerWiseConfigurations.getTrainingWorkspaceMode() : layerWiseConfigurations.getInferenceWorkspaceMode(); - if(wsm == WorkspaceMode.NONE){ - mgrEven = LayerWorkspaceMgr.noWorkspaces(); - mgrOdd = mgrEven; + try (MemoryWorkspace wsFFWorking = mgr.notifyScopeEntered( + ArrayType.FF_WORKING_MEM)) { //Working memory: opened/closed once per layer + //Activations workspaces: opened/closed every second layer. + //So mgrEven (WS_LAYER_ACT_1) open at start of 0, 2, 4, 8; closed at end of 1, 3, 5, 7 etc + //and mgrOdd (WS_LAYER_ACT_2) opened at start of 1, 3, 5, 7; closed at end of 2, 4, 6, 8 etc + temp = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS); - //Check for external workspace - doesn't make sense to have one with workspace mode NONE - if(outputWorkspace != null && !(outputWorkspace instanceof DummyWorkspace)){ - throw new IllegalStateException("Workspace \"" + outputWorkspace.getId() + - "\" was provided for the network/layer outputs, however " + (train ? "training" : "inference") + - " workspace mode is set to NONE. Cannot put output activations into the specified workspace if" + - "workspaces are disabled for the network. use getConfiguration().setTraining/InferenceWorkspaceMode(WorkspaceMode.ENABLED)"); + //Note that because we're opening activation workspaces not in a simple nested order, we'll manually + // override the previous workspace setting. 
Otherwise, when we close these workspaces, the "current" + // workspace may be set to the incorrect one + temp.setPreviousWorkspace(initialWorkspace); + + if (i == 0 && input.isAttached()) { + //Don't leverage out of async DataSetIterator workspaces + mgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); + } + + if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { + input = getLayerWiseConfigurations().getInputPreProcess(i) + .preProcess(input, getInputMiniBatchSize(), mgr); + //Validation: Exception if invalid (bad preprocessor implementation) + validateArrayWorkspaces(mgr, input, ArrayType.ACTIVATIONS, i, true, + "Output of layer (inference)"); + } + + if (i == layerIndex) { + if (outputWorkspace != null && !(outputWorkspace instanceof DummyWorkspace)) { + //Place activations in user-specified workspace + mgr.setWorkspace(ArrayType.ACTIVATIONS, outputWorkspace.getId(), + outputWorkspace.getWorkspaceConfiguration()); + } else { + //Final activations: should be detached + mgr.setScopedOutFor(ArrayType.ACTIVATIONS); } - } else { - mgrEven = LayerWorkspaceMgr.builder() - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_LAYER_ACT_1, WS_LAYER_ACT_X_CONFIG) - .with(ArrayType.INPUT, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG) //Inputs should always be in the previous WS - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); + } - mgrOdd = LayerWorkspaceMgr.builder() - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG) - .with(ArrayType.INPUT, WS_LAYER_ACT_1, WS_LAYER_ACT_X_CONFIG) //Inputs should always be in the previous WS - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); + if (fwdPassType == FwdPassType.STANDARD) { + //Standard feed-forward case + if (i > 0 && ConvolutionUtils.layerHasConvolutionLayout(layers[i - 1].conf().getLayer()) + && ConvolutionUtils.layerHasConvolutionLayout(layers[i].conf().getLayer())) { + + CNN2DFormat preLayerFormat = ConvolutionUtils.getFormatForLayer( + layers[i - 1].conf().getLayer()); + CNN2DFormat currLayerFormat = ConvolutionUtils.getFormatForLayer( + layers[i].conf().getLayer()); + if (preLayerFormat != currLayerFormat) { + //NHWC case + if (preLayerFormat == CNN2DFormat.NCHW) { + input = input.permute(0, 3, 1, 2); + } + //NCHW case + else if (preLayerFormat == CNN2DFormat.NHWC) { + input = input.permute(0, 2, 3, 1); + + } else { + throw new IllegalStateException( + "No CNN2DDataFormat type found for previous layer!"); + } + } + + input = layers[i].activate(input, train, mgr); + } else if (i > 0 && Convolution1DUtils.hasRnnDataFormat(layers[i - 1].conf().getLayer()) + && Convolution1DUtils.hasRnnDataFormat(layers[i].conf().getLayer())) { + RNNFormat preLayerFormat = Convolution1DUtils.getRnnFormatFromLayer( + layers[i - 1].conf().getLayer()); + RNNFormat currLayerFormat = Convolution1DUtils.getRnnFormatFromLayer( + layers[i].conf().getLayer()); + //permute for next layer + if (preLayerFormat != currLayerFormat) { + input = input.permute(0, 2, 1); + } + + input = layers[i].activate(input, train, mgr); + + + } else { + input = layers[i].activate(input, train, mgr); + } + } else if (fwdPassType == FwdPassType.RNN_TIMESTEP) { + //rnnTimeStep case + if (layers[i] instanceof RecurrentLayer) { + input = ((RecurrentLayer) 
layers[i]).rnnTimeStep(reshapeTimeStepInput(input), mgr); + } else if (layers[i] instanceof BaseWrapperLayer + && ((BaseWrapperLayer) layers[i]).getUnderlying() instanceof RecurrentLayer) { + RecurrentLayer rl = ((RecurrentLayer) ((BaseWrapperLayer) layers[i]).getUnderlying()); + input = rl.rnnTimeStep(reshapeTimeStepInput(input), mgr); + } else if (layers[i] instanceof MultiLayerNetwork) { + input = ((MultiLayerNetwork) layers[i]).rnnTimeStep(reshapeTimeStepInput(input)); + } else { + input = layers[i].activate(input, false, mgr); + } + } else { + throw new IllegalArgumentException( + "Unsupported forward pass type for this method: " + fwdPassType); + } + layers[i].clear(); + //Validation: Exception if invalid (bad layer implementation) + validateArrayWorkspaces(mgr, input, ArrayType.ACTIVATIONS, i, false, + "Output of layer (inference)"); + + if (wsActCloseNext != null) { + wsActCloseNext.close(); + } + wsActCloseNext = temp; + temp = null; } - mgrEven.setHelperWorkspacePointers(helperWorkspaces); - mgrOdd.setHelperWorkspacePointers(helperWorkspaces); - MemoryWorkspace wsActCloseNext = null; - MemoryWorkspace temp = null; - MemoryWorkspace initialWorkspace = Nd4j.getMemoryManager().getCurrentWorkspace(); + if (traceLog) { + log.trace("Completed forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); + } - boolean traceLog = log.isTraceEnabled(); - - Throwable t = null; + //Edge case: for first layer with dropout, inputs can't be in previous workspace (as it hasn't been opened yet) + //Hence: put inputs in working memory -> set back to default for next use of workspace mgr + if (i == 0 && wsm != WorkspaceMode.NONE) { + mgr.setWorkspace(ArrayType.INPUT, WS_LAYER_ACT_2, + WS_LAYER_ACT_X_CONFIG); //Inputs should always be in the previous WS + } + } + } catch (Throwable t2) { + t = t2; + } finally { + if (wsActCloseNext != null) { try { - for (int i = 0; i <= layerIndex; i++) { - LayerWorkspaceMgr mgr = (i % 2 == 0 ? mgrEven : mgrOdd); - - if (traceLog) { - log.trace("About to forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); - } - - //Edge case: for first layer with dropout, inputs can't be in previous workspace (as it hasn't been opened yet) - //Hence: put inputs in working memory - if (i == 0 && wsm != WorkspaceMode.NONE) { - mgr.setWorkspace(ArrayType.INPUT, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG); - } - - try (MemoryWorkspace wsFFWorking = mgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { //Working memory: opened/closed once per layer - //Activations workspaces: opened/closed every second layer. - //So mgrEven (WS_LAYER_ACT_1) open at start of 0, 2, 4, 8; closed at end of 1, 3, 5, 7 etc - //and mgrOdd (WS_LAYER_ACT_2) opened at start of 1, 3, 5, 7; closed at end of 2, 4, 6, 8 etc - temp = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS); - - //Note that because we're opening activation workspaces not in a simple nested order, we'll manually - // override the previous workspace setting. 
Otherwise, when we close these workspaces, the "current" - // workspace may be set to the incorrect one - temp.setPreviousWorkspace(initialWorkspace); - - - if (i == 0 && input.isAttached()) { - //Don't leverage out of async DataSetIterator workspaces - mgr.setNoLeverageOverride(input.data().getParentWorkspace().getId()); - } - - if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { - input = getLayerWiseConfigurations().getInputPreProcess(i).preProcess(input, getInputMiniBatchSize(), mgr); - //Validation: Exception if invalid (bad preprocessor implementation) - validateArrayWorkspaces(mgr, input, ArrayType.ACTIVATIONS, i, true, "Output of layer (inference)"); - } - - if (i == layerIndex) { - if (outputWorkspace != null && !(outputWorkspace instanceof DummyWorkspace)) { - //Place activations in user-specified workspace - mgr.setWorkspace(ArrayType.ACTIVATIONS, outputWorkspace.getId(), outputWorkspace.getWorkspaceConfiguration()); - } else { - //Final activations: should be detached - mgr.setScopedOutFor(ArrayType.ACTIVATIONS); - } - } - - if (fwdPassType == FwdPassType.STANDARD) { - //Standard feed-forward case - if(i > 0 && ConvolutionUtils.layerHasConvolutionLayout(layers[i - 1].conf().getLayer()) - && ConvolutionUtils.layerHasConvolutionLayout(layers[i].conf().getLayer())) { - - CNN2DFormat preLayerFormat = ConvolutionUtils.getFormatForLayer(layers[i - 1].conf().getLayer()); - CNN2DFormat currLayerFormat = ConvolutionUtils.getFormatForLayer(layers[i].conf().getLayer()); - if(preLayerFormat != currLayerFormat) { - //NHWC case - if(preLayerFormat == CNN2DFormat.NCHW) { - input = input.permute(0,3,1,2); - } - //NCHW case - else if(preLayerFormat == CNN2DFormat.NHWC) { - input = input.permute(0,2,3,1); - - } - else - throw new IllegalStateException("No CNN2DDataFormat type found for previous layer!"); - } - - input = layers[i].activate(input, train, mgr); - } else if(i > 0 && Convolution1DUtils.hasRnnDataFormat(layers[i - 1].conf().getLayer()) - && Convolution1DUtils.hasRnnDataFormat(layers[i].conf().getLayer())) { - RNNFormat preLayerFormat = Convolution1DUtils.getRnnFormatFromLayer(layers[i - 1].conf().getLayer()); - RNNFormat currLayerFormat = Convolution1DUtils.getRnnFormatFromLayer(layers[i].conf().getLayer()); - //permute for next layer - if(preLayerFormat != currLayerFormat) { - input = input.permute(0,2,1); - } - - input = layers[i].activate(input, train, mgr); - - - } else - input = layers[i].activate(input, train, mgr); - } else if (fwdPassType == FwdPassType.RNN_TIMESTEP) { - //rnnTimeStep case - if (layers[i] instanceof RecurrentLayer) { - input = ((RecurrentLayer) layers[i]).rnnTimeStep(reshapeTimeStepInput(input), mgr); - } else if (layers[i] instanceof BaseWrapperLayer && ((BaseWrapperLayer) layers[i]).getUnderlying() instanceof RecurrentLayer) { - RecurrentLayer rl = ((RecurrentLayer) ((BaseWrapperLayer) layers[i]).getUnderlying()); - input = rl.rnnTimeStep(reshapeTimeStepInput(input), mgr); - } else if (layers[i] instanceof MultiLayerNetwork) { - input = ((MultiLayerNetwork) layers[i]).rnnTimeStep(reshapeTimeStepInput(input)); - } else { - input = layers[i].activate(input, false, mgr); - } - } else { - throw new IllegalArgumentException("Unsupported forward pass type for this method: " + fwdPassType); - } - layers[i].clear(); - //Validation: Exception if invalid (bad layer implementation) - validateArrayWorkspaces(mgr, input, ArrayType.ACTIVATIONS, i, false, "Output of layer (inference)"); - - if (wsActCloseNext != null) { - wsActCloseNext.close(); - } - 
wsActCloseNext = temp; - temp = null; - } - - if (traceLog) { - log.trace("Completed forward pass: {} - {}", i, layers[i].getClass().getSimpleName()); - } - - //Edge case: for first layer with dropout, inputs can't be in previous workspace (as it hasn't been opened yet) - //Hence: put inputs in working memory -> set back to default for next use of workspace mgr - if (i == 0 && wsm != WorkspaceMode.NONE) { - mgr.setWorkspace(ArrayType.INPUT, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG); //Inputs should always be in the previous WS - } - } - } catch (Throwable t2){ - t = t2; - } finally { - if(wsActCloseNext != null){ - try { - wsActCloseNext.close(); - } catch (Throwable t2){ - if(t != null){ - log.error("Encountered second exception while trying to close workspace after initial exception"); - log.error("Original exception:", t); - throw t2; - } - } - } - if(temp != null){ - //Should only be non-null on exception - while(temp.isScopeActive()){ - //For safety, should never occur in theory: a single close() call may not be sufficient, if - // workspace scope was borrowed and not properly closed when exception occurred - try{ - temp.close(); - } catch (Throwable t2){ - if(t != null){ - log.error("Encountered second exception while trying to close workspace after initial exception"); - log.error("Original exception:", t); - throw t2; - } - } - } + wsActCloseNext.close(); + } catch (Throwable t2) { + if (t != null) { + log.error( + "Encountered second exception while trying to close workspace after initial exception"); + log.error("Original exception:", t); + throw t2; + } + } + } + if (temp != null) { + //Should only be non-null on exception + while (temp.isScopeActive()) { + //For safety, should never occur in theory: a single close() call may not be sufficient, if + // workspace scope was borrowed and not properly closed when exception occurred + try { + temp.close(); + } catch (Throwable t2) { + if (t != null) { + log.error( + "Encountered second exception while trying to close workspace after initial exception"); + log.error("Original exception:", t); + throw t2; } + } + } + } - Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace); + Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace); - if(t != null){ - if(t instanceof RuntimeException){ - throw ((RuntimeException)t); - } - throw new RuntimeException("Error during neural network forward pass", t); - } + if (t != null) { + if (t instanceof RuntimeException) { + throw ((RuntimeException) t); + } + throw new RuntimeException("Error during neural network forward pass", t); + } - if(outputWorkspace == null || outputWorkspace instanceof DummyWorkspace) { - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active at the end of outputOfLayerDetached", true); - } else { - Preconditions.checkState(outputWorkspace.isScopeActive(), "Expected output workspace to still be open" + - "at end of outputOfLayerDetached, but it is closed. This suggests an implementation or layer workspace problem"); - } + if (outputWorkspace == null || outputWorkspace instanceof DummyWorkspace) { + WorkspaceUtils.assertNoWorkspacesOpen( + "Expected no workspace active at the end of outputOfLayerDetached", true); + } else { + Preconditions.checkState(outputWorkspace.isScopeActive(), + "Expected output workspace to still be open" + + "at end of outputOfLayerDetached, but it is closed. 
This suggests an implementation or layer workspace problem"); + } + } + + return input; + } + + private INDArray reshapeTimeStepInput(INDArray input) { + if (input.rank() == 2) { // dynamically reshape to 3D input with one time-step. + long[] inShape = input.shape(); + input = input.reshape(inShape[0], inShape[1], 1); + } + return input; + } + + /** + * Compute activations of all layers from input (inclusive) to output of the final/output layer. + * Equivalent to calling {@link #feedForward(boolean)} with train=false + * + * @return the list of activations for each layer, including the input + */ + public List feedForward() { + return feedForward(false); + } + + /** + * Compute activations of all layers from input (inclusive) to output of the final/output layer. + * Equivalent to calling {@link #feedForward(INDArray, boolean)} with train = false + * + * @return the list of activations for each layer, including the input + */ + public List feedForward(INDArray input) { + if (input == null) { + throw new IllegalStateException("Unable to perform feed forward; no input found"); + } + setInput(input); + return feedForward(); + } + + /** + * Compute the activations from the input to the output layer, given mask arrays (that may be + * null). The masking arrays are used in situations such as one-to-many and many-to-one recurrent + * neural network (RNN) designs, as well as for supporting time series of varying lengths within + * the same minibatch for RNNs. Other than mask arrays, this is equivalent to calling + * {@link #feedForward(INDArray, boolean)} with train = false + */ + public List feedForward(INDArray input, INDArray featuresMask, INDArray labelsMask) { + setLayerMaskArrays(featuresMask, labelsMask); + List list = feedForward(input); + clearLayerMaskArrays(); + return list; + } + + @Override + public Gradient gradient() { + return gradient; + } + + @Override + public Pair gradientAndScore() { + return new Pair<>(gradient(), score()); + } + + /** + * Clone the MultiLayerNetwork + * + * @return A cloned MultiLayerNetwork with a copy of the configuration, parameters and updater + * identical to the current network. + */ + @Override + public MultiLayerNetwork clone() { + if (!initCalled) { + init(); + } + MultiLayerConfiguration conf = this.layerWiseConfigurations.clone(); + MultiLayerNetwork ret = new MultiLayerNetwork(conf); + ret.init(this.params().dup(), false); + + if (solver != null) { + //If solver is null: updater hasn't been initialized -> getUpdater call will force initialization, however + Updater u = this.getUpdater(); + INDArray updaterState = u.getStateViewArray(); + if (updaterState != null) { + ret.getUpdater().setStateViewArray(ret, updaterState.dup(), false); + } + } + + if (hasAFrozenLayer()) { + //correct layers to frozen layers + Layer[] clonedLayers = ret.getLayers(); + for (int i = 0; i < layers.length; i++) { + if (layers[i] instanceof FrozenLayer) { + clonedLayers[i] = new FrozenLayer(ret.getLayer(i)); + } + } + ret.setLayers(clonedLayers); + } + return ret; + } + + protected boolean hasAFrozenLayer() { + for (int i = 0; i < layers.length - 1; i++) { + if (layers[i] instanceof FrozenLayer) { + return true; + } + } + return false; + } + + /** + * @deprecated To be removed. Use {@link #params()} instead + */ + @Deprecated + public INDArray params(boolean backwardOnly) { + return params(); + } + + /** + * Returns a 1 x m vector where the vector is composed of a flattened vector of all of the + * parameters in the network.
See {@link #getParam(String)} and {@link #paramTable()} for a + * more useful/interpretable representation of the parameters.
Note that the parameter vector + is not a copy, and changes to the returned INDArray will impact the network parameters. + * + * @return the parameters for this neural net + */ + @Override + public INDArray params() { + return flattenedParams; + } + + /** + * Set the parameters for this model. This expects a linear ndarray which is then unpacked + * internally relative to the expected ordering of the model.
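A minimal sketch of the setParams contract described here, assuming both networks were built from the same configuration so the linear parameter ordering matches; the helper name is illustrative only.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.api.ndarray.INDArray;

class ParamTransferSketch {
  // Copies the flattened parameters of 'source' into 'target'.
  static void copyParameters(MultiLayerNetwork source, MultiLayerNetwork target) {
    INDArray params = source.params().dup(); // detach from the source network first
    if (params.length() != target.numParams()) {
      throw new IllegalArgumentException(
          "Configurations do not match: " + params.length() + " vs " + target.numParams());
    }
    target.setParams(params);
  }
}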
See also: + * {@link #setParamTable(Map)} and {@link #setParam(String, INDArray)} + * + * @param params the parameters for the model + */ + @Override + public void setParams(INDArray params) { + if (flattenedParams == params) { + return; //No op + } + + if (flattenedParams != null && params.length() == flattenedParams.length()) { + if (params != flattenedParams) { + flattenedParams.assign(params); + } + } else { + if (flattenedParams == null) { + flattenedParams = params.dup(); + } + int idx = 0; + for (int i = 0; i < getLayers().length; i++) { + Layer layer = getLayer(i); + long range = layer.numParams(); + if (range <= 0) { + continue; //Some layers: no parameters (subsampling, etc) + } + INDArray get = params.get(NDArrayIndex.interval(0, 0, true), + NDArrayIndex.interval(idx, range + idx)); + layer.setParams(get); + idx += range; + } + } + } + + @Override + public void setParamsViewArray(INDArray params) { + throw new UnsupportedOperationException("Not yet implemented"); + } + + @Override + public INDArray getGradientsViewArray() { + return flattenedGradients; + } + + @Override + public void setBackpropGradientsViewArray(INDArray gradients) { + int paramsSoFar = 0; + for (Layer layer : layers) { + if (layer.numParams() == 0) { + continue; + } + layer.setBackpropGradientsViewArray(gradients.get(NDArrayIndex.interval(0, 0, true), + NDArrayIndex.interval(paramsSoFar, paramsSoFar + layer.numParams()))); + paramsSoFar += layer.numParams(); + } + } + + @Override + public TrainingConfig getConfig() { + throw new UnsupportedOperationException("Not supported"); + } + + /** + * Returns the number of parameters in the network + * + * @return The number of parameters + */ + @Override + public long numParams() { + if (!isInitCalled()) { + init(); + } + return flattenedParams == null ? 0 : flattenedParams.length(); //May be null for a network with 0 parameters + } + + /** + * Returns the number of parameters in the network + * + * @param backwards If true: exclude any parameters used only in unsupervised layerwise training + * (such as the decoder parameters in an autoencoder) + * @return The number of parameters + */ + @Override + public long numParams(boolean backwards) { + int length = 0; + for (int i = 0; i < layers.length; i++) { + length += layers[i].numParams(backwards); + } + + return length; + } + + /** + * Sets the input and labels and returns the F1 score for the prediction with respect to the true + * labels + * + * @param data the data to score + * @return the score for the given input,label pairs + */ + @Override + public double f1Score(org.nd4j.linalg.dataset.api.DataSet data) { + return f1Score(data.getFeatures(), data.getLabels()); + } + + /** + * Perform minibatch training on all minibatches in the DataSetIterator, for the specified number + * of epochs. Equivalent to calling {@link #fit(DataSetIterator)} numEpochs times in a loop + * + * @param iterator Training data (DataSetIterator). Iterator must support resetting + * @param numEpochs Number of training epochs, >= 1 + */ + public void fit(@NonNull DataSetIterator iterator, int numEpochs) { + Preconditions.checkArgument(numEpochs > 0, "Number of epochs must be > 0. 
Got numEpochs = %s", + numEpochs); + Preconditions.checkArgument(numEpochs == 1 || iterator.resetSupported(), + "Cannot perform multiple epochs training using an" + + " iterator that does not support resetting (iterator.resetSupported() returned false)"); + + for (int i = 0; i < numEpochs; i++) { + fit(iterator); + } + } + + /** + * Perform minibatch training on all minibatches in the DataSetIterator for 1 epoch.
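For illustration only, the multi-epoch overload implemented above can be driven as follows; the iterator is assumed to come from elsewhere and must support resetting whenever more than one epoch is requested.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;

class MultiEpochFitSketch {
  // Hypothetical helper: trains for several epochs, guarding the reset requirement up front.
  static void trainForEpochs(MultiLayerNetwork net, DataSetIterator trainData, int numEpochs) {
    if (numEpochs > 1 && !trainData.resetSupported()) {
      throw new IllegalArgumentException("Iterator must support reset() for multi-epoch training");
    }
    net.fit(trainData, numEpochs); // equivalent to calling fit(trainData) numEpochs times
  }
}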
Note that + * this method does not do layerwise pretraining.
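Since fit(DataSetIterator) performs no layerwise pretraining, here is a hedged sketch of running unsupervised pretraining separately before supervised fitting; pretrain(DataSetIterator) is assumed to behave as in stock Deeplearning4j and the iterator is assumed to be resettable.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;

class PretrainThenFitSketch {
  static void pretrainThenFit(MultiLayerNetwork net, DataSetIterator data) {
    net.pretrain(data); // unsupervised layerwise pretraining (only affects layers that support it)
    data.reset();
    net.fit(data);      // one supervised epoch, as documented here
  }
}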
For pretraining use method pretrain.. + * {@link #pretrain(DataSetIterator)}
+ * + * @param iterator Training data (DataSetIterator) + */ + @Override + public void fit(DataSetIterator iterator) { + try { + fitHelper(iterator); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + private synchronized void fitHelper(DataSetIterator iterator) { + // we're wrapping all iterators into AsyncDataSetIterator to provide background prefetch - where appropriate + DataSetIterator iter; + boolean destructable = false; + if (iterator.asyncSupported()) { + iter = new AsyncDataSetIterator(iterator, + Math.min(Nd4j.getAffinityManager().getNumberOfDevices() * 2, 2), true); + destructable = true; + } else { + iter = iterator; + } + + for (TrainingListener tl : trainingListeners) { + tl.onEpochStart(this); + } + + LayerWorkspaceMgr workspaceMgr; + if (getLayerWiseConfigurations().getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + workspaceMgr = LayerWorkspaceMgr.builder() + .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + //Note for updater working memory, we have the option to re-use WS_ALL_LAYERS_ACT or FF/BP_WORKING_MEM + // as these should be closed by the time updaters are executed + //Generally, WS_ALL_LAYERS_ACT will be the larger of the two, so we'll use this + .with(ArrayType.UPDATER_WORKING_MEM, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .build(); + } + workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); + + update(TaskUtils.buildTask(iter)); + if (!iter.hasNext() && iter.resetSupported()) { + iter.reset(); + } + long time1 = System.currentTimeMillis(); + while (iter.hasNext()) { + + DataSet next = iter.next(); + long time2 = System.currentTimeMillis(); + + lastEtlTime.set((time2 - time1)); + + if (next.getFeatures() == null || next.getLabels() == null) { + break; + } + + // TODO: basically we want to wrap internals of this loop into workspace + + boolean hasMaskArrays = next.hasMaskArrays(); + + if (layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT) { + doTruncatedBPTT(next.getFeatures(), next.getLabels(), next.getFeaturesMaskArray(), + next.getLabelsMaskArray(), workspaceMgr); + } else { + if (hasMaskArrays) { + setLayerMaskArrays(next.getFeaturesMaskArray(), next.getLabelsMaskArray()); } - return input; - } + setInput(next.getFeatures()); + setLabels(next.getLabels()); - private INDArray reshapeTimeStepInput(INDArray input) { - if (input.rank() == 2) { // dynamically reshape to 3D input with one time-step. - long[] inShape = input.shape(); - input = input.reshape(inShape[0], inShape[1], 1); + if (solver == null) { + try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) + .build(); + } } - return input; - } - /** - * Compute activations of all layers from input (inclusive) to output of the final/output layer. 
- * Equivalent to calling {@link #feedForward(boolean)} with train=false - * - * @return the list of activations for each layer, including the input - */ - public List feedForward() { - return feedForward(false); - } + //TODO CACHE + solver.optimize(workspaceMgr); + } - /** - * Compute activations of all layers from input (inclusive) to output of the final/output layer. - * Equivalent to calling {@link #feedForward(INDArray, boolean)} with train = false - * - * @return the list of activations for each layer, including the input - */ - public List feedForward(INDArray input) { - if (input == null) - throw new IllegalStateException("Unable to perform feed forward; no input found"); - setInput(input); - return feedForward(); - } - - /** - * Compute the activations from the input to the output layer, given mask arrays (that may be null) - * The masking arrays are used in situations such an one-to-many and many-to-one rucerrent neural network (RNN) - * designs, as well as for supporting time series of varying lengths within the same minibatch for RNNs. - * Other than mask arrays, this is equivalent to calling {@link #feedForward(INDArray, boolean)} with train = false - */ - public List feedForward(INDArray input, INDArray featuresMask, INDArray labelsMask) { - setLayerMaskArrays(featuresMask, labelsMask); - List list = feedForward(input); + if (hasMaskArrays) { clearLayerMaskArrays(); - return list; + } + + time1 = System.currentTimeMillis(); + synchronizeIterEpochCounts(); } - - @Override - public Gradient gradient() { - return gradient; + if (!trainingListeners.isEmpty()) { + for (TrainingListener tl : trainingListeners) { + tl.onEpochEnd(this); + } } - @Override - public Pair gradientAndScore() { - return new Pair<>(gradient(), score()); + clearLayersStates(); + + if (destructable) { + ((AsyncDataSetIterator) iter).shutdown(); } + incrementEpochCount(); + } - /** - * Clone the MultiLayerNetwork - * @return A cloned MultiLayerNetwork with a copy of the configuration, parameters and updater identical to the current network. 
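A small sketch of the clone semantics documented here: the clone receives duplicated parameters (and updater state), so later in-place changes to the original do not leak into the copy. Assumes the source network has been initialised; names are illustrative.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.api.ndarray.INDArray;

class CloneSketch {
  static MultiLayerNetwork snapshot(MultiLayerNetwork net) {
    MultiLayerNetwork copy = net.clone(); // configuration, parameters and updater are duplicated
    INDArray original = net.params();
    INDArray cloned = copy.params();
    assert original.equals(cloned);       // identical values immediately after cloning
    assert original != cloned;            // but backed by different arrays
    return copy;
  }
}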
- */ - @Override - public MultiLayerNetwork clone() { - if(!initCalled) - init(); - MultiLayerConfiguration conf = this.layerWiseConfigurations.clone(); - MultiLayerNetwork ret = new MultiLayerNetwork(conf); - ret.init(this.params().dup(), false); - - if (solver != null) { - //If solver is null: updater hasn't been initialized -> getUpdater call will force initialization, however - Updater u = this.getUpdater(); - INDArray updaterState = u.getStateViewArray(); - if (updaterState != null) { - ret.getUpdater().setStateViewArray(ret, updaterState.dup(), false); - } - } - - if (hasAFrozenLayer()) { - //correct layers to frozen layers - Layer[] clonedLayers = ret.getLayers(); - for (int i = 0; i < layers.length; i++) { - if (layers[i] instanceof FrozenLayer) { - clonedLayers[i] = new FrozenLayer(ret.getLayer(i)); - } - } - ret.setLayers(clonedLayers); - } - return ret; + /** + * Calculate parameter gradients and input activation gradients given the input and labels, and + * optionally mask arrays + * + * @param features Features for gradient calculation + * @param label Labels for gradient + * @param fMask Features mask array (may be null) + * @param labelMask Label mask array (may be null) + * @return A pair of gradient arrays: parameter gradients (in Gradient object) and input + * activation gradients + */ + public Pair calculateGradients(@NonNull INDArray features, + @NonNull INDArray label, + INDArray fMask, INDArray labelMask) { + try { + return calculateGradientsHelper(features, label, fMask, labelMask); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; } + } - protected boolean hasAFrozenLayer() { - for (int i = 0; i < layers.length - 1; i++) { - if (layers[i] instanceof FrozenLayer) - return true; - } - return false; + private Pair calculateGradientsHelper(INDArray features, INDArray label, + INDArray fMask, + INDArray labelMask) { + setInput(features); + setLabels(label); + setLayerMaskArrays(fMask, labelMask); + + LayerWorkspaceMgr mgr; + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + mgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + mgr = LayerWorkspaceMgr.builder() + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + + if (layerWiseConfigurations.getCacheMode() != null) { + //For now: store cache mode activations in activations workspace + mgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); + } } + mgr.setHelperWorkspacePointers(helperWorkspaces); - - /** - * @deprecated To be removed. Use {@link #params()} instead - */ - @Deprecated - public INDArray params(boolean backwardOnly) { - return params(); - } - - - /** - * Returns a 1 x m vector where the vector is composed of a flattened vector of all of the parameters in the network.
- * See {@link #getParam(String)} and {@link #paramTable()} for a more useful/interpretable representation of the parameters.
- * Note that the parameter vector is not a copy, and changes to the returned INDArray will impact the network parameters. - * - * @return the parameters for this neural net - */ - @Override - public INDArray params() { - return flattenedParams; - } - - /** - * Set the parameters for this model. - * This expects a linear ndarray which then be unpacked internally relative to the expected ordering of the model.
- * See also: {@link #setParamTable(Map)} and {@link #setParam(String, INDArray)} - * - * @param params the parameters for the model - */ - @Override - public void setParams(INDArray params) { - if (flattenedParams == params) { - return; //No op - } - - if (flattenedParams != null && params.length() == flattenedParams.length()) { - if (params != flattenedParams) { - flattenedParams.assign(params); - } - } else { - if (flattenedParams == null) - flattenedParams = params.dup(); - int idx = 0; - for (int i = 0; i < getLayers().length; i++) { - Layer layer = getLayer(i); - long range = layer.numParams(); - if (range <= 0) - continue; //Some layers: no parameters (subsampling, etc) - INDArray get = params.get(NDArrayIndex.interval(0,0,true), NDArrayIndex.interval(idx, range + idx)); - layer.setParams(get); - idx += range; - } - } - } - - @Override - public void setParamsViewArray(INDArray params) { - throw new UnsupportedOperationException("Not yet implemented"); - } - - @Override - public INDArray getGradientsViewArray() { - return flattenedGradients; - } - - @Override - public void setBackpropGradientsViewArray(INDArray gradients) { - int paramsSoFar = 0; - for (Layer layer : layers) { - if (layer.numParams() == 0) - continue; - layer.setBackpropGradientsViewArray(gradients.get(NDArrayIndex.interval(0,0,true), - NDArrayIndex.interval(paramsSoFar, paramsSoFar + layer.numParams()))); - paramsSoFar += layer.numParams(); - } - } - - @Override - public TrainingConfig getConfig() { - throw new UnsupportedOperationException("Not supported"); - } - - /** - * Returns the number of parameters in the network - * - * @return The number of parameters - */ - @Override - public long numParams() { - if(!isInitCalled()) - init(); - return flattenedParams == null ? 0 : flattenedParams.length(); //Maybe nul for 0 params net - } - - /** - * Returns the number of parameters in the network - * - * @param backwards If true: exclude any parameters uned only in unsupervised layerwise training (such as the decoder - * parameters in an autoencoder) - * @return The number of parameters - */ - @Override - public long numParams(boolean backwards) { - int length = 0; - for (int i = 0; i < layers.length; i++) - length += layers[i].numParams(backwards); - - return length; - } - - /** - * Sets the input and labels and returns the F1 score for the prediction with respect to the true labels - * - * @param data the data to score - * @return the score for the given input,label pairs - */ - @Override - public double f1Score(org.nd4j.linalg.dataset.api.DataSet data) { - return f1Score(data.getFeatures(), data.getLabels()); - } - - /** - * Perform minibatch training on all minibatches in the DataSetIterator, for the specified number of epochs. - * Equvalent to calling {@link #fit(DataSetIterator)} numEpochs times in a loop - * - * @param iterator Training data (DataSetIterator). Iterator must support resetting - * @param numEpochs Number of training epochs, >= 1 - */ - public void fit(@NonNull DataSetIterator iterator, int numEpochs){ - Preconditions.checkArgument(numEpochs > 0, "Number of epochs much be > 0. Got numEpochs = %s", numEpochs); - Preconditions.checkArgument(numEpochs == 1 || iterator.resetSupported(), "Cannot perform multiple epochs training using" + - "iterator thas does not support resetting (iterator.resetSupported() returned false)"); - - for(int i=0; i - * Note that this method does not do layerwise pretraining.
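As an illustrative alternative to the numEpochs overload (not part of this change set): because this method trains for exactly one epoch, multi-epoch training can also be written as an explicit loop, which is convenient when per-epoch work such as logging or checkpointing is needed. ScoreIterationListener and getEpochCount() are assumed to be available as in stock Deeplearning4j.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.optimize.listeners.ScoreIterationListener;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;

class EpochLoopSketch {
  static void train(MultiLayerNetwork net, DataSetIterator trainData, int numEpochs) {
    net.setListeners(new ScoreIterationListener(100)); // log the score every 100 iterations
    for (int epoch = 0; epoch < numEpochs; epoch++) {
      net.fit(trainData);  // one full pass over the iterator
      trainData.reset();   // make the data available again for the next epoch
      System.out.println("Finished epoch " + net.getEpochCount());
    }
  }
}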
- * For pretraining use method pretrain.. {@link #pretrain(DataSetIterator)}
- * @param iterator Training data (DataSetIterator) - */ - @Override - public void fit(DataSetIterator iterator) { - try{ - fitHelper(iterator); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - private synchronized void fitHelper(DataSetIterator iterator){ - // we're wrapping all iterators into AsyncDataSetIterator to provide background prefetch - where appropriate - DataSetIterator iter; - boolean destructable = false; - if (iterator.asyncSupported()) { - iter = new AsyncDataSetIterator(iterator, Math.min(Nd4j.getAffinityManager().getNumberOfDevices() * 2, 2), true); - destructable = true; - } else { - iter = iterator; - } - + //Calculate activations (which are stored in each layer, and used in backprop) + try (MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS)) { + //First: do a feed-forward through the network + //Note that we don't actually need to do the full forward pass through the output layer right now; but we do + // need the input to the output layer to be set (such that backprop can be done) + List activations = ffToLayerActivationsInWs(layers.length - 2, FwdPassType.STANDARD, + false, input, mask, fMask); + if (!trainingListeners.isEmpty()) { + //TODO: We possibly do want output layer activations in some cases here... for (TrainingListener tl : trainingListeners) { - tl.onEpochStart(this); + tl.onForwardPass(this, activations); } + } + INDArray inputToOutputLayer = activations.get(activations.size() - 1); + if (layerWiseConfigurations.getInputPreProcess(layers.length - 1) != null) { + inputToOutputLayer = layerWiseConfigurations.getInputPreProcess(layers.length - 1) + .preProcess(inputToOutputLayer, getInputMiniBatchSize(), mgr); + //Validate activations location + } + getOutputLayer().setInput(inputToOutputLayer, mgr); - LayerWorkspaceMgr workspaceMgr; - if(getLayerWiseConfigurations().getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - //Note for updater working memory, we have the option to re-use WS_ALL_LAYERS_ACT or FF/BP_WORKING_MEM - // as these should be closed by the time updaters are executed - //Generally, WS_ALL_LAYERS_ACT will be the larger of the two, so we'll use this - .with(ArrayType.UPDATER_WORKING_MEM, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .build(); - } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - - update(TaskUtils.buildTask(iter)); - if (!iter.hasNext() && iter.resetSupported()) { - iter.reset(); - } - long time1 = System.currentTimeMillis(); - while (iter.hasNext()) { - - DataSet next = iter.next(); - long time2 = System.currentTimeMillis(); - - lastEtlTime.set((time2 - time1)); - - if (next.getFeatures() == null || next.getLabels() == null) - break; - - // TODO: basically we want to wrap internals of this loop into workspace - - - boolean hasMaskArrays = next.hasMaskArrays(); - - if (layerWiseConfigurations.getBackpropType() == 
BackpropType.TruncatedBPTT) { - doTruncatedBPTT(next.getFeatures(), next.getLabels(), next.getFeaturesMaskArray(), - next.getLabelsMaskArray(), workspaceMgr); - } else { - if (hasMaskArrays) - setLayerMaskArrays(next.getFeaturesMaskArray(), next.getLabelsMaskArray()); - - setInput(next.getFeatures()); - setLabels(next.getLabels()); - - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) - .build(); - } - } - - //TODO CACHE - solver.optimize(workspaceMgr); - } - - if (hasMaskArrays) - clearLayerMaskArrays(); - - time1 = System.currentTimeMillis(); - synchronizeIterEpochCounts(); - } - - if (!trainingListeners.isEmpty()) { - for (TrainingListener tl : trainingListeners) { - tl.onEpochEnd(this); - } - } - - clearLayersStates(); - - if (destructable) - ((AsyncDataSetIterator) iter).shutdown(); - - incrementEpochCount(); + Pair p = calcBackpropGradients(null, true, false, true); + if (p.getSecond() != null) { + p.setSecond(p.getSecond().detach()); + } + return p; } + } - /** - * Calculate parameter gradients and input activation gradients given the input and labels, and optionally mask arrays - * - * @param features Features for gradient calculation - * @param label Labels for gradient - * @param fMask Features mask array (may be null) - * @param labelMask Label mask array (may be null) - * @return A pair of gradient arrays: parameter gradients (in Gradient object) and input activation gradients - */ - public Pair calculateGradients(@NonNull INDArray features, @NonNull INDArray label, - INDArray fMask, INDArray labelMask) { - try{ - return calculateGradientsHelper(features, label, fMask, labelMask); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } + /** + * Calculate gradients and errors. Used in two places: (a) backprop (for standard multi layer + * network learning) (b) backpropGradient (layer method, for when MultiLayerNetwork is used as a + * layer) + * + * @param epsilon Errors (technically errors .* activations). Not used if + * withOutputLayer = true + * @param withOutputLayer if true: assume last layer is output layer, and calculate errors + * based on labels. In this case, the epsilon input is not used + * (may/should be null). If false: calculate backprop gradients + * @param returnInputActGrad If true: terun the input activation gradients (detached). 
False: + * don't return + * @return Gradients and the error (epsilon) at the input + */ + protected Pair calcBackpropGradients(INDArray epsilon, + boolean withOutputLayer, boolean tbptt, + boolean returnInputActGrad) { + if (flattenedGradients == null) { + initGradientsView(); } + String multiGradientKey; + Gradient gradient = new DefaultGradient(flattenedGradients); - private Pair calculateGradientsHelper(INDArray features, INDArray label, INDArray fMask, - INDArray labelMask){ - setInput(features); - setLabels(label); - setLayerMaskArrays(fMask, labelMask); + LayerWorkspaceMgr mgrEven; + LayerWorkspaceMgr mgrOdd; - LayerWorkspaceMgr mgr; - if(layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - mgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - mgr = LayerWorkspaceMgr.builder() - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); - - if(layerWiseConfigurations.getCacheMode() != null){ - //For now: store cache mode activations in activations workspace - mgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); - } - } - mgr.setHelperWorkspacePointers(helperWorkspaces); - - //Calculate activations (which are stored in each layer, and used in backprop) - try(MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS)) { - //First: do a feed-forward through the network - //Note that we don't actually need to do the full forward pass through the output layer right now; but we do - // need the input to the output layer to be set (such that backprop can be done) - List activations = ffToLayerActivationsInWs(layers.length - 2, FwdPassType.STANDARD, false, input, mask, fMask); - if (!trainingListeners.isEmpty()) { - //TODO: We possibly do want output layer activations in some cases here... - for (TrainingListener tl : trainingListeners) { - tl.onForwardPass(this, activations); - } - } - INDArray inputToOutputLayer = activations.get(activations.size() - 1); - if (layerWiseConfigurations.getInputPreProcess(layers.length - 1) != null) { - inputToOutputLayer = layerWiseConfigurations.getInputPreProcess(layers.length - 1) - .preProcess(inputToOutputLayer, getInputMiniBatchSize(), mgr); - //Validate activations location - } - getOutputLayer().setInput(inputToOutputLayer, mgr); - - Pair p = calcBackpropGradients(null, true, false, true); - if(p.getSecond() != null){ - p.setSecond( p.getSecond().detach()); - } - return p; - } - } - - /** Calculate gradients and errors. Used in two places: - * (a) backprop (for standard multi layer network learning) - * (b) backpropGradient (layer method, for when MultiLayerNetwork is used as a layer) - * @param epsilon Errors (technically errors .* activations). Not used if withOutputLayer = true - * @param withOutputLayer if true: assume last layer is output layer, and calculate errors based on labels. In this - * case, the epsilon input is not used (may/should be null). - * If false: calculate backprop gradients - * @param returnInputActGrad If true: terun the input activation gradients (detached). 
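A hedged usage sketch for the calculateGradients method added in this change: it returns the parameter gradients together with the (detached) gradient with respect to the input, which is useful for input-sensitivity checks. The Pair import reflects current ND4J packaging and may differ in this repository.

import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.common.primitives.Pair;
import org.nd4j.linalg.api.ndarray.INDArray;

class InputGradientSketch {
  // Gradient of the loss w.r.t. the input features for one labelled batch (no masks).
  static INDArray inputGradient(MultiLayerNetwork net, INDArray features, INDArray labels) {
    Pair<Gradient, INDArray> p = net.calculateGradients(features, labels, null, null);
    Gradient paramGradients = p.getFirst();  // per-parameter gradients, keyed per layer and variable
    INDArray inputGradients = p.getSecond(); // same shape as 'features', already detached
    return inputGradients;
  }
}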
False: don't return - * @return Gradients and the error (epsilon) at the input - */ - protected Pair calcBackpropGradients(INDArray epsilon, boolean withOutputLayer, boolean tbptt, - boolean returnInputActGrad) { - if (flattenedGradients == null) { - initGradientsView(); - } - String multiGradientKey; - Gradient gradient = new DefaultGradient(flattenedGradients); - - LayerWorkspaceMgr mgrEven; - LayerWorkspaceMgr mgrOdd; - - if(layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - mgrEven = LayerWorkspaceMgr.noWorkspaces(); - mgrOdd = mgrEven; - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in calcBackpropGradients when " + - "training workspace is set to none"); - } else { + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + mgrEven = LayerWorkspaceMgr.noWorkspaces(); + mgrOdd = mgrEven; + WorkspaceUtils.assertNoWorkspacesOpen( + "Expected no workspace active in calcBackpropGradients when " + + "training workspace is set to none"); + } else { /* Workspaces for backprop in MLN share some features with outputOfLayerDetached, in terms of the "two alternating workspaces" idea (but for activation gradients here, instead of activations there). @@ -1884,1422 +2081,1546 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, org.d */ - mgrEven = LayerWorkspaceMgr.builder() - //Activations in context of backprop (preOut methods etc) are not used outside of the layer itself - .with(ArrayType.ACTIVATIONS, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) //Usually not required here. Exception: OutputLayer dropout - .with(ArrayType.ACTIVATION_GRAD, WS_LAYER_ACT_1, WS_LAYER_ACT_X_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); + mgrEven = LayerWorkspaceMgr.builder() + //Activations in context of backprop (preOut methods etc) are not used outside of the layer itself + .with(ArrayType.ACTIVATIONS, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, + WS_ALL_LAYERS_ACT_CONFIG) //Usually not required here. Exception: OutputLayer dropout + .with(ArrayType.ACTIVATION_GRAD, WS_LAYER_ACT_1, WS_LAYER_ACT_X_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); - mgrOdd = LayerWorkspaceMgr.builder() - //Activations in context of backprop (preOut methods etc) are not used outside of the layer itself - .with(ArrayType.ACTIVATIONS, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) //Usually not required here. 
Exception: OutputLayer dropout - .with(ArrayType.ACTIVATION_GRAD, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); + mgrOdd = LayerWorkspaceMgr.builder() + //Activations in context of backprop (preOut methods etc) are not used outside of the layer itself + .with(ArrayType.ACTIVATIONS, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, + WS_ALL_LAYERS_ACT_CONFIG) //Usually not required here. Exception: OutputLayer dropout + .with(ArrayType.ACTIVATION_GRAD, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); - if(epsilon == null) { - //If epsilon is non-null: external errors use case -> inputs are already detached - WorkspaceUtils.assertOpenActiveAndCurrent(WS_ALL_LAYERS_ACT, "calcBackpropGradients method requires workspace WS_ALL_LAYERS_ACT" + - " to be open when workspaces are used"); - } - } - mgrEven.setHelperWorkspacePointers(helperWorkspaces); - mgrOdd.setHelperWorkspacePointers(helperWorkspaces); - - //calculate and apply the backward gradient for every layer - /* - * Skip the output layer for the indexing and just loop backwards updating the coefficients for each layer. - * (when withOutputLayer == true) - * - * Activate applies the activation function for each layer and sets that as the input for the following layer. - * - * Typical literature contains most trivial case for the error calculation: wT * weights - * This interpretation transpose a few things to get mini batch because ND4J is rows vs columns organization for params - */ - int numLayers = getnLayers(); - //Store gradients is a list; used to ensure iteration order in DefaultGradient linked hash map. i.e., layer 0 first instead of output layer - LinkedList> gradientList = new LinkedList<>(); - - - Pair currPair = null; - MemoryWorkspace wsActGradCloseNext = null; - MemoryWorkspace wsActGradTemp = null; - MemoryWorkspace initialWorkspace = Nd4j.getMemoryManager().getCurrentWorkspace(); - - boolean traceLog = log.isTraceEnabled(); - - Throwable t = null; - try { - for (int i = layers.length - 1; i >= 0; i--) { - if (layers[i] instanceof FrozenLayer) { - break; - } - - if (traceLog) { - log.trace("About to backprop: {} - {}", i, layers[i].getClass().getSimpleName()); - } - - LayerWorkspaceMgr workspaceMgr = (i % 2 == 0 ? mgrEven : mgrOdd); - - if (withOutputLayer && i == layers.length - 1) { - if (!(getOutputLayer() instanceof IOutputLayer)) { - log.warn("Warning: final layer isn't output layer. 
You cannot use backprop without an output layer."); - return null; - } - - IOutputLayer outputLayer = (IOutputLayer) getOutputLayer(); - if (labels == null && outputLayer.needsLabels()) - throw new IllegalStateException("No labels found"); - outputLayer.setLabels(labels); - } - - //Open activation gradients WS *then* BP working memory, so BP working memory is opened last for use in layers - wsActGradTemp = workspaceMgr.notifyScopeEntered(ArrayType.ACTIVATION_GRAD); - try (MemoryWorkspace wsBPWorking = workspaceMgr.notifyScopeEntered(ArrayType.BP_WORKING_MEM)) { - - //Note that because we're opening activation workspaces not in a simple nested order, we'll manually - // override the previous workspace setting. Otherwise, when we close these workspaces, the "current" - // workspace may be set to the incorrect one - wsActGradTemp.setPreviousWorkspace(initialWorkspace); - wsBPWorking.setPreviousWorkspace(initialWorkspace); - - INDArray eps = (i == layers.length - 1 ? epsilon : currPair.getRight()); //eps is null for OutputLayer - - if (!tbptt) { - //Standard case - currPair = layers[i].backpropGradient(eps, workspaceMgr); - } else { - //TBPTT gradient - if (layers[i] instanceof RecurrentLayer) { - currPair = ((RecurrentLayer) layers[i]).tbpttBackpropGradient(currPair.getSecond(), - layerWiseConfigurations.getTbpttBackLength(), workspaceMgr); - } else { - currPair = layers[i].backpropGradient(currPair.getSecond(), workspaceMgr); - } - } - - if (currPair.getSecond() != null) { - //Edge case: may be null for Embedding layer, for example - validateArrayWorkspaces(workspaceMgr, currPair.getSecond(), ArrayType.ACTIVATION_GRAD, i, - false, "Backprop"); - } - - for (Map.Entry entry : currPair.getFirst().gradientForVariable().entrySet()) { - String origName = entry.getKey(); - multiGradientKey = i + "_" + origName; - gradientList.addLast(new Triple<>(multiGradientKey, entry.getValue(), - currPair.getFirst().flatteningOrderForVariable(origName))); - } - if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { - currPair = new Pair<>(currPair.getFirst(), - this.layerWiseConfigurations.getInputPreProcess(i) - .backprop(currPair.getSecond(), getInputMiniBatchSize(), workspaceMgr)); - if (i > 0 && currPair.getSecond() != null) { - validateArrayWorkspaces(workspaceMgr, currPair.getSecond(), ArrayType.ACTIVATION_GRAD, i, - true, "Backprop"); - } - } - - if (i == 0) { - if (returnInputActGrad && currPair.getSecond() != null) { - currPair.setSecond(currPair.getSecond().detach()); - } else { - currPair.setSecond(null); - } - } - - if (wsActGradCloseNext != null) { - wsActGradCloseNext.close(); - } - wsActGradCloseNext = wsActGradTemp; - wsActGradTemp = null; - } - - if (traceLog) { - log.trace("Completed backprop: {} - {}", i, layers[i].getClass().getSimpleName()); - } - } - } catch (Throwable thr ){ - t = thr; - } finally { - if(wsActGradCloseNext != null){ - try { - wsActGradCloseNext.close(); - } catch (Throwable t2){ - if(t != null){ - log.error("Encountered second exception while trying to close workspace after initial exception"); - log.error("Original exception:", t); - throw t2; - } - } - } - if(wsActGradTemp != null) { - //Should only be non-null on exception - try { - wsActGradTemp.close(); - } catch (Throwable t2) { - if (t != null) { - log.error("Encountered second exception while trying to close workspace after initial exception"); - log.error("Original exception:", t); - throw t2; - } - } - } - Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace); - - if(t != null){ - if(t 
instanceof RuntimeException){ - throw ((RuntimeException)t); - } - throw new RuntimeException("Error during neural network forward pass", t); - } - } - - if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { - WorkspaceUtils.assertNoWorkspacesOpen("Expected no workspace active in calcBackpropGradients when " + - "training workspace is set to none"); - } else { - if(epsilon == null) { - //If epsilon != null: external errors use case (inputs are detached instead) - WorkspaceUtils.assertOpenActiveAndCurrent(WS_ALL_LAYERS_ACT, "calcBackpropGradients: WS_ALL_LAYERS_ACT is no" + - " longer the currently open/active workspace"); - } - } - - //Add gradients to Gradients (map), in correct order - for (Triple triple : gradientList) { - gradient.setGradientFor(triple.getFirst(), triple.getSecond(), triple.getThird()); - } - - return new Pair<>(gradient, currPair.getSecond()); + if (epsilon == null) { + //If epsilon is non-null: external errors use case -> inputs are already detached + WorkspaceUtils.assertOpenActiveAndCurrent(WS_ALL_LAYERS_ACT, + "calcBackpropGradients method requires workspace WS_ALL_LAYERS_ACT" + + " to be open when workspaces are used"); + } } + mgrEven.setHelperWorkspacePointers(helperWorkspaces); + mgrOdd.setHelperWorkspacePointers(helperWorkspaces); - protected void doTruncatedBPTT(INDArray input, INDArray labels, INDArray featuresMaskArray, - INDArray labelsMaskArray, LayerWorkspaceMgr workspaceMgr) { - if (input.rank() != 3 || labels.rank() != 3) { - log.warn("Cannot do truncated BPTT with non-3d inputs or labels. Expect input with shape [miniBatchSize,nIn,timeSeriesLength], got " - + Arrays.toString(input.shape()) + "\tand labels with shape " - + Arrays.toString(labels.shape())); - return; - } - if (input.size(2) != labels.size(2)) { - log.warn("Input and label time series have different lengths: {} input length, {} label length", - input.size(2), labels.size(2)); - return; - } - - int fwdLen = layerWiseConfigurations.getTbpttFwdLength(); - update(TaskUtils.buildTask(input, labels)); - val timeSeriesLength = input.size(2); - long nSubsets = timeSeriesLength / fwdLen; - if (timeSeriesLength % fwdLen != 0) - nSubsets++; //Example: 100 fwdLen with timeSeriesLength=120 -> want 2 subsets (1 of size 100, 1 of size 20) - - rnnClearPreviousState(); - - for (int i = 0; i < nSubsets; i++) { - long startTimeIdx = (long) i * fwdLen; - long endTimeIdx = startTimeIdx + fwdLen; - if (endTimeIdx > timeSeriesLength) - endTimeIdx = timeSeriesLength; - - if (startTimeIdx > Integer.MAX_VALUE || endTimeIdx > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - INDArray[] subsets = getSubsetsForTbptt((int) startTimeIdx, (int) endTimeIdx, input, labels, - featuresMaskArray, labelsMaskArray); - - setInput(subsets[0]); - setLabels(subsets[1]); - setLayerMaskArrays(subsets[2], subsets[3]); - - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) - .build(); - } - } - solver.optimize(workspaceMgr); - - //Finally, update the state of the RNN layers: - updateRnnStateWithTBPTTState(); - } - - rnnClearPreviousState(); - clearLayerMaskArrays(); - } - - private INDArray[] getSubsetsForTbptt(int startTimeIdx, int endTimeIdx, INDArray input, INDArray labels, - INDArray fMask, INDArray lMask ){ - INDArray[] out = new INDArray[4]; - out[0] = input.get(NDArrayIndex.all(), NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - 
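For clarity, a standalone sketch of the time-window slicing performed for truncated BPTT here: a [miniBatchSize, nIn, timeSeriesLength] array is cut into forward-length windows along dimension 2. Shapes and the fwdLen value are illustrative only.

import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;

class TbpttSliceSketch {
  public static void main(String[] args) {
    int fwdLen = 20;
    INDArray input = Nd4j.rand(new int[]{4, 8, 50}); // [miniBatchSize, nIn, timeSeriesLength]
    long timeSeriesLength = input.size(2);
    long nSubsets = timeSeriesLength / fwdLen;
    if (timeSeriesLength % fwdLen != 0) {
      nSubsets++; // the last window is shorter than fwdLen
    }
    for (long i = 0; i < nSubsets; i++) {
      long start = i * fwdLen;
      long end = Math.min(start + fwdLen, timeSeriesLength);
      INDArray window = input.get(NDArrayIndex.all(), NDArrayIndex.all(),
          NDArrayIndex.interval(start, end));
      System.out.println("window " + i + " shape: " + Arrays.toString(window.shape()));
    }
  }
}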
out[1] = labels.get(NDArrayIndex.all(), NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - - if (fMask != null) { - out[2] = fMask.get(NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - } - if (lMask != null) { - out[3] = lMask.get(NDArrayIndex.all(), - NDArrayIndex.interval(startTimeIdx, endTimeIdx)); - } - - return out; - } - - /** - * Intended for internal/developer use + //calculate and apply the backward gradient for every layer + /* + * Skip the output layer for the indexing and just loop backwards updating the coefficients for each layer. + * (when withOutputLayer == true) + * + * Activate applies the activation function for each layer and sets that as the input for the following layer. + * + * Typical literature contains most trivial case for the error calculation: wT * weights + * This interpretation transpose a few things to get mini batch because ND4J is rows vs columns organization for params */ - public void updateRnnStateWithTBPTTState() { - for (int i = 0; i < layers.length; i++) { + int numLayers = getnLayers(); + //Store gradients is a list; used to ensure iteration order in DefaultGradient linked hash map. i.e., layer 0 first instead of output layer + LinkedList> gradientList = new LinkedList<>(); + + Pair currPair = null; + MemoryWorkspace wsActGradCloseNext = null; + MemoryWorkspace wsActGradTemp = null; + MemoryWorkspace initialWorkspace = Nd4j.getMemoryManager().getCurrentWorkspace(); + + boolean traceLog = log.isTraceEnabled(); + + Throwable t = null; + try { + for (int i = layers.length - 1; i >= 0; i--) { + if (layers[i] instanceof FrozenLayer) { + break; + } + + if (traceLog) { + log.trace("About to backprop: {} - {}", i, layers[i].getClass().getSimpleName()); + } + + LayerWorkspaceMgr workspaceMgr = (i % 2 == 0 ? mgrEven : mgrOdd); + + if (withOutputLayer && i == layers.length - 1) { + if (!(getOutputLayer() instanceof IOutputLayer)) { + log.warn( + "Warning: final layer isn't output layer. You cannot use backprop without an output layer."); + return null; + } + + IOutputLayer outputLayer = (IOutputLayer) getOutputLayer(); + if (labels == null && outputLayer.needsLabels()) { + throw new IllegalStateException("No labels found"); + } + outputLayer.setLabels(labels); + } + + //Open activation gradients WS *then* BP working memory, so BP working memory is opened last for use in layers + wsActGradTemp = workspaceMgr.notifyScopeEntered(ArrayType.ACTIVATION_GRAD); + try (MemoryWorkspace wsBPWorking = workspaceMgr.notifyScopeEntered( + ArrayType.BP_WORKING_MEM)) { + + //Note that because we're opening activation workspaces not in a simple nested order, we'll manually + // override the previous workspace setting. Otherwise, when we close these workspaces, the "current" + // workspace may be set to the incorrect one + wsActGradTemp.setPreviousWorkspace(initialWorkspace); + wsBPWorking.setPreviousWorkspace(initialWorkspace); + + INDArray eps = (i == layers.length - 1 ? 
epsilon + : currPair.getRight()); //eps is null for OutputLayer + + if (!tbptt) { + //Standard case + currPair = layers[i].backpropGradient(eps, workspaceMgr); + } else { + //TBPTT gradient if (layers[i] instanceof RecurrentLayer) { - RecurrentLayer l = ((RecurrentLayer) layers[i]); - l.rnnSetPreviousState(l.rnnGetTBPTTState()); - } else if (layers[i] instanceof MultiLayerNetwork) { - ((MultiLayerNetwork) layers[i]).updateRnnStateWithTBPTTState(); - } - } - } - - /** - * Get the {@link TrainingListener}s set for this network, if any - * @return listeners set for this network - */ - public Collection getListeners() { - return trainingListeners; - } - - /** - * @deprecated Use {@link #getListeners()} - */ - @Deprecated - public Collection getTrainingListeners() { - return trainingListeners; - } - - @Override - public void setListeners(Collection listeners) { - if (layers == null) { - init(); - } - for (Layer layer : layers) { - layer.setListeners(listeners); - } - - if (solver != null) { - solver.setListeners(listeners); - } - - this.trainingListeners.clear(); - if (listeners != null) { - this.trainingListeners.addAll(listeners); - } - } - - /** - * This method ADDS additional TrainingListener to existing listeners - * - * @param listeners - */ - @Override - public void addListeners(TrainingListener... listeners) { - Collections.addAll(trainingListeners, listeners); - - // fixme this is wrong, since it removes existing listeners from the solver - if (solver != null) { - solver.setListeners(this.trainingListeners); - } - } - - @Override - public void setListeners(TrainingListener... listeners) { - Collection cListeners = new ArrayList<>(); - //Check: user might have done setListeners(null) thinking this would clear the current listeners. - //This results in an TrainingListener[1] with a single null value -> results in a NPE later - if (listeners != null && listeners.length > 0) { - for (TrainingListener i : listeners) { - if (i != null) - cListeners.add(i); - } - } - setListeners(cListeners); - } - - /** - * Usable only for classification networks in conjunction with OutputLayer. Cannot be used with RnnOutputLayer, - * CnnLossLayer, or networks used for regression.
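An illustrative sketch of the relationship described in this predict documentation: for a classification network with a standard OutputLayer, predict is simply the per-example argmax of the class probabilities returned by output. Names are hypothetical.

import java.util.Arrays;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.api.ndarray.INDArray;

class PredictSketch {
  static int[] predictedClasses(MultiLayerNetwork net, INDArray features) {
    INDArray probabilities = net.output(features, false); // [numExamples, numClasses]
    int[] viaOutput = probabilities.argMax(1).toIntVector();
    int[] viaPredict = net.predict(features); // same result in one call
    assert Arrays.equals(viaOutput, viaPredict);
    return viaPredict;
  }
}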
- * To get the raw output activations of the output layer, use {@link #output(INDArray)} or similar.
- *
- * Equivalent to argmax(this.output(input)): Returns the predicted class indices corresponding to the predictions - * for each example in the features array. - * - * @param d The input features to perform inference on - * @return The predicted class index for each example - */ - @Override - public int[] predict(INDArray d) { - INDArray output = output(d, Layer.TrainingMode.TEST); - - if (d.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - - Preconditions.checkState(output.rank() == 2, "predict(INDArray) method can only be used on rank 2 output - got array with rank %s", output.rank()); - return output.argMax(1).toIntVector(); - } - - /** - * As per {@link #predict(INDArray)} but the returned values are looked up from the list of label names - * in the provided DataSet - */ - @Override - public List predict(org.nd4j.linalg.dataset.api.DataSet dataSet) { - Preconditions.checkState(dataSet.getLabelNamesList() != null, "This method can only be used when the DataSet contains a label name list"); - int[] intRet = predict(dataSet.getFeatures()); - List ret = new ArrayList<>(); - for (int i = 0; i < intRet.length; i++) { - ret.add(i, dataSet.getLabelName(intRet[i])); - } - return ret; - } - - /** - * Fit the model for one iteration on the provided data - * - * @param data the examples to classify (one example in each row) - * @param labels the example labels(a binary outcome matrix) - */ - @Override - public void fit(INDArray data, INDArray labels) { - fit(data, labels, null, null); - } - - /** - * Fit the model for one iteration on the provided data - * - * @param features the examples to classify (one example in each row) - * @param labels the example labels(a binary outcome matrix) - * @param featuresMask The mask array for the features (used for variable length time series, etc). May be null. - * @param labelsMask The mask array for the labels (used for variable length time series, etc). May be null. 
- */ - public synchronized void fit(INDArray features, INDArray labels, INDArray featuresMask, INDArray labelsMask) { - try{ - fitHelper(features, labels, featuresMask, labelsMask); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - private void fitHelper(INDArray features, INDArray labels, INDArray featuresMask, INDArray labelsMask){ - if(numParams() == 0) { - //No op: can't fit a network with 0 parameters - return; - } - - setInput(features); - setLabels(labels); - this.setLayerMaskArrays(featuresMask, labelsMask); - update(TaskUtils.buildTask(features, labels)); - - LayerWorkspaceMgr workspaceMgr; - if(layerWiseConfigurations.getTrainingWorkspaceMode() == null){ - workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - workspaceMgr = LayerWorkspaceMgr.builder() - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - //Note for updater working memory, we have the option to re-use WS_ALL_LAYERS_ACT or FF/BP_WORKING_MEM - // these should be closed by the time updaters are executed - //Generally, WS_ALL_LAYERS_ACT will be the larger of the two, so we'll use this - .with(ArrayType.UPDATER_WORKING_MEM, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .build(); - } - workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); - - if (layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT) { - doTruncatedBPTT(features, labels, featuresMask, labelsMask, workspaceMgr); - } else { - if (solver == null) { - try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this).build(); - } - } - //TODO CACHE WORKSPACE, IF USED??? - solver.optimize(workspaceMgr); - } - - clearLayerMaskArrays(); - clearLayersStates(); - synchronizeIterEpochCounts(); - } - - @Override - public void fit(INDArray data, LayerWorkspaceMgr workspaceMgr){ - throw new UnsupportedOperationException("Not supported: use pretrainLayer"); - } - - - /** - * Fit the model for one iteration on the provided data - * - * @param data the data to train on - */ - @Override - public void fit(org.nd4j.linalg.dataset.api.DataSet data) { - fit(data.getFeatures(), data.getLabels(), data.getFeaturesMaskArray(), data.getLabelsMaskArray()); - } - - /** - * Fit the model for one iteration on the provided data - * - * @param examples the examples to classify (one example in each row) - * @param labels the labels for each example (the number of labels must match - */ - @Override - public void fit(INDArray examples, int[] labels) { - org.deeplearning4j.nn.conf.layers.OutputLayer layerConf = - (org.deeplearning4j.nn.conf.layers.OutputLayer) getOutputLayer().conf().getLayer(); - - if (layerConf.getNOut() > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - fit(examples, FeatureUtil.toOutcomeMatrix(labels, (int) layerConf.getNOut())); - } - - - /** - * Perform inference on the provided input/features - i.e., perform forward pass using the provided input/features - * and return the output of the final layer. - * - * @param input Input to the network - * @param train whether the output is test or train. This mainly affect hyper parameters such as dropout and - * batch normalization, which have different behaviour for test vs. 
train - * @return The network predictions - i.e., the activations of the final layer - */ - public INDArray output(INDArray input, TrainingMode train) { - return output(input, train == TrainingMode.TRAIN); - } - - /** - * Perform inference on the provided input/features - i.e., perform forward pass using the provided input/features - * and return the output of the final layer. - * - * @param input Input to the network - * @param train whether the output is test or train. This mainly affect hyper parameters such as dropout and - * batch normalization, which have different behaviour for test vs. train - * @return The network predictions - i.e., the activations of the final layer - */ - public INDArray output(INDArray input, boolean train) { - return output(input, train, null, null); - } - - /** - * Calculate the output of the network, with masking arrays. The masking arrays are used in situations such - * as one-to-many and many-to-one recurrent neural network (RNN) designs, as well as for supporting time series - * of varying lengths within the same minibatch. - */ - public INDArray output(INDArray input, boolean train, INDArray featuresMask, INDArray labelsMask) { - return output(input, train, featuresMask, labelsMask, null); - } - - /** - * Get the network output, which is optionally placed in the specified memory workspace.
- * If no memory workspace is provided, the output will be detached (not in any workspace).
- * If a memory workspace is provided, the output activation array (i.e., the INDArray returned by this method) - * will be placed in the specified workspace. This workspace must be opened by the user before calling this method - - * and the user is responsible for (a) closing this workspace, and (b) ensuring the output array is not used out - * of scope (i.e., not used after closing the workspace to which it belongs - as this is likely to cause either - * an exception when used, or a crash). - * - * @param input Input to the network - * @param train True for train, false otherwise - * @param outputWorkspace May be null. If not null: the workspace MUST be opened before calling this method. - * @return The output/activations from the network (either detached or in the specified workspace if provided) - */ - public INDArray output(INDArray input, boolean train, MemoryWorkspace outputWorkspace) { - return output(input, train, null, null, outputWorkspace); - } - - /** - * Get the network output, which is optionally placed in the specified memory workspace.
- * If no memory workspace is provided, the output will be detached (not in any workspace).
- * If a memory workspace is provided, the output activation array (i.e., the INDArray returned by this method) - * will be placed in the specified workspace. This workspace must be opened by the user before calling this method - - * and the user is responsible for (a) closing this workspace, and (b) ensuring the output array is not used out - * of scope (i.e., not used after closing the workspace to which it belongs - as this is likely to cause either - * an exception when used, or a crash). - * - * @param input Input to the network - * @param train True for train, false otherwise - * @param outputWorkspace May be null. If not null: the workspace MUST be opened before calling this method. - * @return The output/activations from the network (either detached or in the specified workspace if provided) - */ - public synchronized INDArray output(INDArray input, boolean train, INDArray featuresMask, INDArray labelsMask, MemoryWorkspace outputWorkspace) { - try { - return outputOfLayerDetached(train, FwdPassType.STANDARD, layers.length - 1, input, featuresMask, labelsMask, outputWorkspace); - } catch (OutOfMemoryError e) { - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - /** - * This method uses provided OutputAdapter to return custom object built from INDArray - * - * PLEASE NOTE: This method uses dedicated Workspace for output generation to avoid redundant allocations - * - * @param inputs Input arrays to the netwonk - * @param inputMasks Optional input mask arrays (may be null) - * @param labelMasks Optional label mask arrays (may be null - * @param outputAdapter OutputAdapter instance - * @param T extends Object - * @return T instance produced by OutputAdapter - */ - public synchronized T output(@NonNull INDArray inputs, INDArray inputMasks, INDArray labelMasks, @NonNull OutputAdapter outputAdapter) { - try (val ws = Nd4j.getWorkspaceManager().getAndActivateWorkspace(WS_ALL_LAYERS_ACT_CONFIG, WS_OUTPUT_MEM)) { - if (outputAdapter instanceof ModelAdapter) - return ((ModelAdapter) outputAdapter).apply(this, new INDArray[]{inputs}, new INDArray[]{ inputMasks}, new INDArray[]{labelMasks}); - else - return outputAdapter.apply(output(inputs, false, inputMasks, labelMasks, ws)); - } - } - - /** - * Perform inference on the provided input/features - i.e., perform forward pass using the provided input/features - * and return the output of the final layer. Equivalent to {@link #output(INDArray, boolean)} with train=false - i.e., - * this method is used for inference. - * - * @param input Input to the network - * @return The network predictions - i.e., the activations of the final layer - */ - public INDArray output(INDArray input) { - return output(input, TrainingMode.TEST); - } - - /** - * Generate the output for all examples/batches in the input iterator, and concatenate them into a single array. - * See {@link #output(INDArray)}
- * NOTE 1: The output array can require a considerable amount of memory for iterators with a large number of examples
- * NOTE 2: This method cannot be used for variable length time series outputs, as this would require padding arrays - * for some outputs, or returning a mask array (which cannot be done with this method). For variable length time - * series applications, use one of the other output methods. This method also cannot be used with fully convolutional - * networks with different output sizes (for example, segmentation on different input image sizes). - * - * - * @param iterator Data to pass through the network - * @return output for all examples in the iterator, concatenated into a - */ - public INDArray output(DataSetIterator iterator, boolean train) { - List outList = new ArrayList<>(); - long[] firstOutputShape = null; - while (iterator.hasNext()) { - DataSet next = iterator.next(); - INDArray features = next.getFeatures(); - - if (features == null) - continue; - - INDArray fMask = next.getFeaturesMaskArray(); - INDArray lMask = next.getLabelsMaskArray(); - INDArray output = this.output(features, train, fMask, lMask); - outList.add(output); - if(firstOutputShape == null){ - firstOutputShape = output.shape(); + currPair = ((RecurrentLayer) layers[i]).tbpttBackpropGradient(currPair.getSecond(), + layerWiseConfigurations.getTbpttBackLength(), workspaceMgr); } else { - //Validate that shapes are the same (may not be, for some RNN variable length time series applications) - long[] currShape = output.shape(); - Preconditions.checkState(firstOutputShape.length == currShape.length, "Error during forward pass:" + - "different minibatches have different output array ranks - first minibatch shape %s, last minibatch shape %s", firstOutputShape, currShape); - for( int i=1; i - * This is equivalent to {@link #score(DataSet, boolean)} with training==false. - * @param data the data to score - * @return the score for the given input,label pairs - * @see #score(DataSet, boolean) - */ - public double score(DataSet data) { - return score(data, false); - } - - /** - * Sets the input and labels and calculates the score (value of the output layer loss function plus l1/l2 if applicable) - * for the prediction with respect to the true labels
- * @param data data to calculate score for - * @param training If true: score during training. If false: score at test time. This can affect the application of - * certain features, such as dropout and dropconnect (which are applied at training time only) - * @return the score (value of the loss function) - */ - public double score(DataSet data, boolean training) { - try{ - return scoreHelper(data, training); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - private double scoreHelper(DataSet data, boolean training){ - boolean hasMaskArray = data.hasMaskArrays(); - if (hasMaskArray) - setLayerMaskArrays(data.getFeaturesMaskArray(), data.getLabelsMaskArray()); - - if (!(getOutputLayer() instanceof IOutputLayer)) { - throw new IllegalStateException("Cannot calculate score if final layer is not an instance of IOutputLayer. " + - "Final layer is of type: " + getOutputLayer().getClass()); - } - - WorkspaceMode wsm = (training ? layerWiseConfigurations.getTrainingWorkspaceMode() : layerWiseConfigurations.getInferenceWorkspaceMode()); - LayerWorkspaceMgr mgr; - if(wsm == WorkspaceMode.NONE){ - mgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - mgr = LayerWorkspaceMgr.builder() - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - //TODO we can probably optimize this - .noWorkspaceFor(ArrayType.ACTIVATIONS) - .noWorkspaceFor(ArrayType.INPUT) - .build(); - } - mgr.setHelperWorkspacePointers(helperWorkspaces); - - INDArray inputToOutputLayer = outputOfLayerDetached(training, FwdPassType.STANDARD,layers.length-2, data.getFeatures(), - data.getFeaturesMaskArray(), data.getLabelsMaskArray(), null); - - if (data.getFeatures().size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - IOutputLayer ol = (IOutputLayer) getOutputLayer(); - if (getLayerWiseConfigurations().getInputPreProcess(layers.length - 1) != null) { - inputToOutputLayer = getLayerWiseConfigurations().getInputPreProcess(layers.length - 1) - .preProcess(inputToOutputLayer, (int) data.getFeatures().size(0), mgr); - } - ol.setInput(inputToOutputLayer, mgr); //Feedforward doesn't include output layer for efficiency - ol.setLabels(data.getLabels()); - double score; - try(MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { - score = ol.computeScore(calcRegularizationScore(true), training, mgr); - } - - if (hasMaskArray) - clearLayerMaskArrays(); - clearLayersStates(); - - return score; - } - - /** - * As per {@link #scoreExamples(DataSet, boolean)} - the outputs (example scores) for all DataSets in the iterator are concatenated - */ - public INDArray scoreExamples(DataSetIterator iter, boolean addRegularizationTerms) { - List out = new ArrayList<>(); - - while (iter.hasNext()) { - out.add(scoreExamples(iter.next(), addRegularizationTerms)); - } - return Nd4j.toFlattened('f', out); - } - - /**Calculate the score for each example in a DataSet individually. Unlike {@link #score(DataSet)} and {@link #score(DataSet, boolean)} - * this method does not average/sum over examples. This method allows for examples to be scored individually (at test time only), which - * may be useful for example for autoencoder architectures and the like.
- * Each row of the output (assuming addRegularizationTerms == true) is equivalent to calling score(DataSet) with a single example. - * @param data The data to score - * @param addRegularizationTerms If true: add l1/l2 regularization terms (if any) to the score. If false: don't add regularization terms - * @return An INDArray (column vector) of size input.numRows(); the ith entry is the score (loss value) of the ith example - */ - public INDArray scoreExamples(DataSet data, boolean addRegularizationTerms) { - try{ - return scoreExamplesHelper(data, addRegularizationTerms); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - private INDArray scoreExamplesHelper(DataSet data, boolean addRegularizationTerms){ - INDArray inputLast = outputOfLayerDetached(false, FwdPassType.STANDARD,layers.length-2, data.getFeatures(), - data.getFeaturesMaskArray(), data.getLabelsMaskArray(), null); - setLabels(data.getLabels()); - setLayerMaskArrays(data.getFeaturesMaskArray(), data.getLabelsMaskArray()); - - //TODO we might want workspaces here? - LayerWorkspaceMgr mgr = LayerWorkspaceMgr.noWorkspaces(); - - INDArray out; - if (getOutputLayer() instanceof IOutputLayer) { - IOutputLayer ol = (IOutputLayer) getOutputLayer(); - if(layerWiseConfigurations.getInputPreProcess(layers.length-1) != null){ - - if (data.getFeatures().size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - inputLast = layerWiseConfigurations.getInputPreProcess(layers.length-1).preProcess(inputLast, - (int) data.getFeatures().size(0), mgr); + for (Map.Entry entry : currPair.getFirst().gradientForVariable() + .entrySet()) { + String origName = entry.getKey(); + multiGradientKey = i + "_" + origName; + gradientList.addLast(new Triple<>(multiGradientKey, entry.getValue(), + currPair.getFirst().flatteningOrderForVariable(origName))); + } + if (getLayerWiseConfigurations().getInputPreProcess(i) != null) { + currPair = new Pair<>(currPair.getFirst(), + this.layerWiseConfigurations.getInputPreProcess(i) + .backprop(currPair.getSecond(), getInputMiniBatchSize(), workspaceMgr)); + if (i > 0 && currPair.getSecond() != null) { + validateArrayWorkspaces(workspaceMgr, currPair.getSecond(), ArrayType.ACTIVATION_GRAD, + i, + true, "Backprop"); } - ol.setLabels(data.getLabels()); - ol.setInput(inputLast, mgr); - double r = (addRegularizationTerms ? 
calcRegularizationScore(true) : 0); - out = ol.computeScoreForExamples(r, mgr); - } else { - throw new UnsupportedOperationException( - "Cannot calculate score with respect to labels without an OutputLayer"); - } + } - clearLayersStates(); - clearLayerMaskArrays(); - return out; - } - - - @Override - public void fit() { - fit(input, labels); - } - - @Override - public void update(INDArray gradient, String paramType) { - throw new UnsupportedOperationException("Not implemented"); - } - - - /** - * Score of the model (relative to the objective function) - previously calculated on the last minibatch - * - * @return the score of the model (relative to the objective function) - */ - @Override - public double score() { - return score; - } - - /** - * Intended for developer/internal use - */ - public void setScore(double score) { - this.score = score; - } - - @Override - public void computeGradientAndScore(LayerWorkspaceMgr layerWorkspaceMgr){ - computeGradientAndScore(); - } - - public void computeGradientAndScore() { - - if (!(getOutputLayer() instanceof IOutputLayer)) { - throw new DL4JException( - "Cannot calculate gradient and score with respect to labels: final layer is not an IOutputLayer. " + - "Final layer class: " + getOutputLayer().getClass() + ". To calculate gradients and fit a network " + - "using backpropagation, the final layer must be an output layer"); - } - - //Note: Workspace manager is only ose here for score calculation... other workspace managers are used in the - // various FF/backprop methds - LayerWorkspaceMgr mgr; - if(layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE){ - mgr = LayerWorkspaceMgr.noWorkspaces(); - } else { - mgr = LayerWorkspaceMgr.builder() - .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) - .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM_CONFIG) - .build(); - - if(layerWiseConfigurations.getCacheMode() != null){ - //For now: store cache mode activations in activations workspace - mgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); + if (i == 0) { + if (returnInputActGrad && currPair.getSecond() != null) { + currPair.setSecond(currPair.getSecond().detach()); + } else { + currPair.setSecond(null); } + } + + if (wsActGradCloseNext != null) { + wsActGradCloseNext.close(); + } + wsActGradCloseNext = wsActGradTemp; + wsActGradTemp = null; } - boolean tbptt = layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT; - FwdPassType fwdType = (tbptt ? 
FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE : FwdPassType.STANDARD); - synchronizeIterEpochCounts(); - - //Calculate activations (which are stored in each layer, and used in backprop) - try(MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS)) { - //First: do a feed-forward through the network - //Note that we don't actually need to do the full forward pass through the output layer right now; but we do - // need the input to the output layer to be set (such that backprop can be done) - List activations = ffToLayerActivationsInWs(layers.length - 2, fwdType, tbptt, input, mask, null); - if (!trainingListeners.isEmpty()) { - //TODO: We possibly do want output layer activations in some cases here... - for (TrainingListener tl : trainingListeners) { - tl.onForwardPass(this, activations); - } - } - INDArray inputToOutputLayer = activations.get(activations.size() - 1); - if (layerWiseConfigurations.getInputPreProcess(layers.length - 1) != null) { - inputToOutputLayer = layerWiseConfigurations.getInputPreProcess(layers.length - 1) - .preProcess(inputToOutputLayer, getInputMiniBatchSize(), mgr); - //Validate activations location - } - getOutputLayer().setInput(inputToOutputLayer, mgr); - //Then: compute gradients - Pair pair = calcBackpropGradients(null, true, false, false); - this.gradient = (pair == null ? null : pair.getFirst()); - - //Calculate score - try(MemoryWorkspace wsFF = mgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { - double r = calcRegularizationScore(true); - score = ((IOutputLayer) getOutputLayer()).computeScore(r, true, mgr); - } - - //Listeners - if (!trainingListeners.isEmpty()) { - try (MemoryWorkspace workspace = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - for (TrainingListener tl : trainingListeners) { - tl.onBackwardPass(this); - } - } - } + if (traceLog) { + log.trace("Completed backprop: {} - {}", i, layers[i].getClass().getSimpleName()); } - - //Clear the post noise/dropconnect parameters on the output layer - getOutputLayer().clearNoiseWeightParams(); - } - - /** - * Clear the inputs. Clears optimizer state. 
- */ - public void clear() { - for (Layer layer : layers) - layer.clear(); - - input = null; - labels = null; - solver = null; - } - - @Override - public void applyConstraints(int iteration, int epoch) { - for(Layer l : layers){ - l.applyConstraints(iteration, epoch); - } - } - - - /** - * Set the input array for the network - * - * @param input Input array to set - */ - public void setInput(INDArray input) { - this.input = input; - if (this.layers == null) { - init(); - } - if (input != null) { - if (input.length() == 0) - throw new IllegalArgumentException( - "Invalid input: length 0 (shape: " + Arrays.toString(input.shape()) + ")"); - - if (input.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - setInputMiniBatchSize((int) input.size(0)); - } - } - - @Override - public void setInput(INDArray input, LayerWorkspaceMgr mgr){ - throw new UnsupportedOperationException("Not supported"); - } - - /** - * Get the output layer - i.e., the last layer in the netwok - * - * @return - */ - public Layer getOutputLayer() { - Layer ret = getLayers()[getLayers().length - 1]; - if (ret instanceof FrozenLayerWithBackprop) { - ret = ((FrozenLayerWithBackprop) ret).getInsideLayer(); - } - return ret; - } - - - /** - * See {@link #setParams(INDArray)} - */ - public void setParameters(INDArray params) { - setParams(params); - } - - /** - * Intended for internal/developer use - */ - public NeuralNetConfiguration getDefaultConfiguration() { - return defaultConfiguration; - } - - public INDArray getLabels() { - return labels; - } - - public INDArray getInput() { - return input; - } - - - /** - * @param labels Labels to set - */ - public void setLabels(INDArray labels) { - this.labels = labels; - } - - /** - * Get the number of layers in the network - * - * @return the number of layers in the network - */ - public int getnLayers() { - return layerWiseConfigurations.getConfs().size(); - } - - /** - * @return The layers in the network - */ - public synchronized Layer[] getLayers() { - return layers; - } - - public Layer getLayer(int i) { - Preconditions.checkArgument(i >= 0 && i < layers.length, "Invalid layer index: layer index must be 0" + - " to %s (inclusive), got index %s", layers.length-1, i); - return layers[i]; - } - - public Layer getLayer(String name) { - return layerMap.get(name); - } - - public List getLayerNames() { - return new ArrayList<>(layerMap.keySet()); - } - - public void setLayers(Layer[] layers) { - this.layers = layers; - } - - public INDArray getMask() { - return mask; - } - - public void setMask(INDArray mask) { - this.mask = mask; - } - - public INDArray getMaskArray() { - return mask; - } - - @Override - public boolean isPretrainLayer() { - return false; - } - - @Override - public void clearNoiseWeightParams() { - for(Layer l : layers){ - l.clearNoiseWeightParams(); - } - } - - @Override - public void allowInputModification(boolean allow) { - throw new UnsupportedOperationException("Not supported"); - } - - @Override - public Pair feedForwardMaskArray(INDArray maskArray, MaskState currentMaskState, - int minibatchSize) { - if (maskArray == null) { - for (int i = 0; i < layers.length; i++) { - layers[i].feedForwardMaskArray(null, null, minibatchSize); - } - } else { - //Do a forward pass through each preprocessor and layer - for (int i = 0; i < layers.length; i++) { - InputPreProcessor preProcessor = getLayerWiseConfigurations().getInputPreProcess(i); - - if (preProcessor != null) { - Pair p = - preProcessor.feedForwardMaskArray(maskArray, currentMaskState, 
minibatchSize); - if (p != null) { - maskArray = p.getFirst(); - currentMaskState = p.getSecond(); - } else { - maskArray = null; - currentMaskState = null; - } - } - - Pair p = - layers[i].feedForwardMaskArray(maskArray, currentMaskState, minibatchSize); - if (p != null) { - maskArray = p.getFirst(); - currentMaskState = p.getSecond(); - } else { - maskArray = null; - currentMaskState = null; - } - } - } - - return new Pair<>(maskArray, currentMaskState); - } - - @Override - public LayerHelper getHelper() { - throw new UnsupportedOperationException("Not supported"); - } - - //========== - //Layer methods - - @Override - public Type type() { - return Type.MULTILAYER; - } - - - /** - * Equivalent to {@link #output(INDArray)} using the input set via {@link #setInput(INDArray)} - */ - public INDArray activate(TrainingMode training) { - return output(input, training == TrainingMode.TRAIN); - } - - /** - * Equivalent to {@link #output(INDArray, TrainingMode)} - */ - public INDArray activate(INDArray input, TrainingMode training) { - return output(input, training == TrainingMode.TRAIN); - } - - @Override - public Pair backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) { - if (getOutputLayer() instanceof IOutputLayer) - throw new UnsupportedOperationException("Cannot calculate gradients based on epsilon with OutputLayer"); - - return calcBackpropGradients(epsilon, false, false, true); - } - - @Override - public void setIndex(int index) { - layerIndex = index; - } - - @Override - public int getIndex() { - return layerIndex; - } - - @Override - public int getIterationCount() { - return getLayerWiseConfigurations().getIterationCount(); - } - - @Override - public int getEpochCount() { - return getLayerWiseConfigurations().getEpochCount(); - } - - @Override - public void setIterationCount(int iterationCount) { - getLayerWiseConfigurations().setIterationCount(iterationCount); - } - - @Override - public void setEpochCount(int epochCount) { - getLayerWiseConfigurations().setEpochCount(epochCount); - } - - @Override - public double calcRegularizationScore(boolean backpropParamsOnly){ - double scoreSum = 0.0; - for (int i = 0; i < layers.length; i++) { - scoreSum += layers[i].calcRegularizationScore(backpropParamsOnly); - } - return scoreSum; - } - - @Override - public void update(Gradient gradient) { - if (gradient.gradient().length() != numParams(true)) - throw new IllegalArgumentException("Invalid input: expect gradients array of length " + numParams(true)); - for (Map.Entry entry : gradient.gradientForVariable().entrySet()) { - String key = entry.getKey(); - INDArray val = entry.getValue(); - int idx = key.indexOf('_'); - if (idx == -1) - throw new IllegalStateException("Invalid param key: not have layer separator: \"" + key + "\""); - Integer layerId = Integer.parseInt(key.substring(0, idx)); - String paramType = key.substring(idx + 1); - // Update MLN gradient - this.gradient.gradientForVariable().put(key, val); - // Update layer params - layers[layerId].update(val, paramType); - } - // Update layerwise gradient view - setBackpropGradientsViewArray(gradient.gradient()); - - } - - @Override - public INDArray activate(boolean training, LayerWorkspaceMgr mgr) { - throw new UnsupportedOperationException(); - } - - @Override - public INDArray activate(INDArray input, boolean training, LayerWorkspaceMgr mgr) { - throw new UnsupportedOperationException(); - } - - @Override - public void setInputMiniBatchSize(int size) { - if (layers != null) - for (Layer l : layers) - 
l.setInputMiniBatchSize(size); - } - - @Override - public int getInputMiniBatchSize() { - if(!conf().isMiniBatch()) - return 1; - - if (input.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - return (int) input.size(0); - } - - @Override - public void setMaskArray(INDArray maskArray) { - throw new UnsupportedOperationException(); - } - - /** - * - * If this MultiLayerNetwork contains one or more RNN layers: conduct forward pass (prediction) - * but using previous stored state for any RNN layers. The activations for the final step are - * also stored in the RNN layers for use next time rnnTimeStep() is called.
- * This method can be used to generate output one or more steps at a time instead of always having to do - * forward pass from t=0. Example uses are for streaming data, and for generating samples from network output - * one step at a time (where samples are then fed back into the network as input)
- * If no previous state is present in RNN layers (i.e., initially or after calling rnnClearPreviousState()), - * the default initialization (usually 0) is used.
- * Supports mini-batch (i.e., multiple predictions/forward pass in parallel) as well as for single examples.
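For illustration, a minimal streaming sketch of this rnnTimeStep usage pattern (a sketch only: the variable net, the 1x10 input shape and the use of Nd4j.rand are hypothetical choices for the example):

    net.rnnClearPreviousState();                        // start from the default (zero) RNN state
    INDArray step1 = Nd4j.rand(DataType.FLOAT, 1, 10);  // one time step, shape [miniBatchSize, nIn]
    INDArray step2 = Nd4j.rand(DataType.FLOAT, 1, 10);
    INDArray out1 = net.rnnTimeStep(step1);             // the updated state is stored inside the RNN layers
    INDArray out2 = net.rnnTimeStep(step2);             // continues from the state left by the previous call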
- * @param input Input to network. May be for one or multiple time steps. For single time step: - * input has shape [miniBatchSize,inputSize] or [miniBatchSize,inputSize,1]. miniBatchSize=1 for single example.
- * For multiple time steps: [miniBatchSize,inputSize,inputTimeSeriesLength] - * @return Output activations. If output is RNN layer (such as RnnOutputLayer): if input has shape [miniBatchSize,inputSize] - * i.e., is 2d, output has shape [miniBatchSize,outputSize] (i.e., also 2d).
- * Otherwise output is 3d [miniBatchSize,outputSize,inputTimeSeriesLength] when using RnnOutputLayer. - * @see #rnnTimeStep(INDArray, MemoryWorkspace) For outputting the activations in the specified workspace - */ - public INDArray rnnTimeStep(INDArray input) { - return rnnTimeStep(input, null); - } - - /** - * See {@link #rnnTimeStep(INDArray)} for details
- * If no memory workspace is provided, the output will be detached (not in any workspace).
- * If a memory workspace is provided, the output activation array (i.e., the INDArray returned by this method) - * will be placed in the specified workspace. This workspace must be opened by the user before calling this method - - * and the user is responsible for (a) closing this workspace, and (b) ensuring the output array is not used out - * of scope (i.e., not used after closing the workspace to which it belongs - as this is likely to cause either - * an exception when used, or a crash). - * - * @param input Input activations - * @param outputWorkspace Output workspace. May be null - * @return The output/activations from the network (either detached or in the specified workspace if provided) - */ - public INDArray rnnTimeStep(INDArray input, MemoryWorkspace outputWorkspace ) { + } + } catch (Throwable thr) { + t = thr; + } finally { + if (wsActGradCloseNext != null) { try { - boolean inputIs2d = input.rank() == 2; - INDArray out = outputOfLayerDetached(false, FwdPassType.RNN_TIMESTEP, layers.length - 1, input, null, null, outputWorkspace); - if (inputIs2d && out.rank() == 3 && layers[layers.length - 1].type() == Type.RECURRENT) { - //Return 2d output with shape [miniBatchSize,nOut] - // instead of 3d output with shape [miniBatchSize,nOut,1] - return out.tensorAlongDimension(0, 1, 0); - } - return out; - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; + wsActGradCloseNext.close(); + } catch (Throwable t2) { + if (t != null) { + log.error( + "Encountered second exception while trying to close workspace after initial exception"); + log.error("Original exception:", t); + throw t2; + } } - } - - /**Get the state of the RNN layer, as used in rnnTimeStep(). - * @param layer Number/index of the layer. - * @return Hidden state, or null if layer is not an RNN layer - */ - public Map rnnGetPreviousState(int layer) { - if (layer < 0 || layer >= layers.length) - throw new IllegalArgumentException("Invalid layer number"); - Layer l = layers[layer]; - if(l instanceof org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer){ - l = ((org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer)l).getUnderlying(); + } + if (wsActGradTemp != null) { + //Should only be non-null on exception + try { + wsActGradTemp.close(); + } catch (Throwable t2) { + if (t != null) { + log.error( + "Encountered second exception while trying to close workspace after initial exception"); + log.error("Original exception:", t); + throw t2; + } } - if (!(l instanceof RecurrentLayer)) - throw new IllegalArgumentException("Layer is not an RNN layer"); - return ((RecurrentLayer) l).rnnGetPreviousState(); - } + } + Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace); - /**Set the state of the RNN layer. - * @param layer The number/index of the layer. 
- * @param state The state to set the specified layer to - */ - public void rnnSetPreviousState(int layer, Map state) { - if (layer < 0 || layer >= layers.length) - throw new IllegalArgumentException("Invalid layer number"); - Layer l = layers[layer]; - if(l instanceof org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer){ - l = ((org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer)l).getUnderlying(); + if (t != null) { + if (t instanceof RuntimeException) { + throw ((RuntimeException) t); } - if (!(l instanceof RecurrentLayer)) - throw new IllegalArgumentException("Layer is not an RNN layer"); - RecurrentLayer r = (RecurrentLayer) l; - r.rnnSetPreviousState(state); + throw new RuntimeException("Error during neural network forward pass", t); + } } - /** Clear the previous state of the RNN layers (if any). - */ - public void rnnClearPreviousState() { - if (layers == null) - return; - for (int i = 0; i < layers.length; i++) { - if (layers[i] instanceof RecurrentLayer) - ((RecurrentLayer) layers[i]).rnnClearPreviousState(); - else if (layers[i] instanceof MultiLayerNetwork) { - ((MultiLayerNetwork) layers[i]).rnnClearPreviousState(); - } else if(layers[i] instanceof BaseWrapperLayer && ((BaseWrapperLayer)layers[i]).getUnderlying() instanceof RecurrentLayer){ - ((RecurrentLayer) ((BaseWrapperLayer)layers[i]).getUnderlying()).rnnClearPreviousState(); - } + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + WorkspaceUtils.assertNoWorkspacesOpen( + "Expected no workspace active in calcBackpropGradients when " + + "training workspace is set to none"); + } else { + if (epsilon == null) { + //If epsilon != null: external errors use case (inputs are detached instead) + WorkspaceUtils.assertOpenActiveAndCurrent(WS_ALL_LAYERS_ACT, + "calcBackpropGradients: WS_ALL_LAYERS_ACT is no" + + " longer the currently open/active workspace"); + } + } + + //Add gradients to Gradients (map), in correct order + for (Triple triple : gradientList) { + gradient.setGradientFor(triple.getFirst(), triple.getSecond(), triple.getThird()); + } + + return new Pair<>(gradient, currPair.getSecond()); + } + + protected void doTruncatedBPTT(INDArray input, INDArray labels, INDArray featuresMaskArray, + INDArray labelsMaskArray, LayerWorkspaceMgr workspaceMgr) { + if (input.rank() != 3 || labels.rank() != 3) { + log.warn( + "Cannot do truncated BPTT with non-3d inputs or labels. 
Expect input with shape [miniBatchSize,nIn,timeSeriesLength], got " + + Arrays.toString(input.shape()) + "\tand labels with shape " + + Arrays.toString(labels.shape())); + return; + } + if (input.size(2) != labels.size(2)) { + log.warn( + "Input and label time series have different lengths: {} input length, {} label length", + input.size(2), labels.size(2)); + return; + } + + int fwdLen = layerWiseConfigurations.getTbpttFwdLength(); + update(TaskUtils.buildTask(input, labels)); + val timeSeriesLength = input.size(2); + long nSubsets = timeSeriesLength / fwdLen; + if (timeSeriesLength % fwdLen != 0) { + nSubsets++; //Example: 100 fwdLen with timeSeriesLength=120 -> want 2 subsets (1 of size 100, 1 of size 20) + } + + rnnClearPreviousState(); + + for (int i = 0; i < nSubsets; i++) { + long startTimeIdx = (long) i * fwdLen; + long endTimeIdx = startTimeIdx + fwdLen; + if (endTimeIdx > timeSeriesLength) { + endTimeIdx = timeSeriesLength; + } + + if (startTimeIdx > Integer.MAX_VALUE || endTimeIdx > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + INDArray[] subsets = getSubsetsForTbptt((int) startTimeIdx, (int) endTimeIdx, input, labels, + featuresMaskArray, labelsMaskArray); + + setInput(subsets[0]); + setLabels(subsets[1]); + setLayerMaskArrays(subsets[2], subsets[3]); + + if (solver == null) { + try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) + .build(); } + } + solver.optimize(workspaceMgr); + + //Finally, update the state of the RNN layers: + updateRnnStateWithTBPTTState(); } - /** Similar to rnnTimeStep and feedForward() methods. Difference here is that this method:
- * (a) like rnnTimeStep does forward pass using stored state for RNN layers, and
- * (b) unlike rnnTimeStep does not modify the RNN layer state
- * Therefore multiple calls to this method with the same input should have the same output.
- * Typically used during training only. Use rnnTimeStep for prediction/forward pass at test time. - * @param input Input to network - * @param training Whether training or not - * @param storeLastForTBPTT set to true if used as part of truncated BPTT training - * @return Activations for each layer (including input, as per feedforward() etc) - */ - public List rnnActivateUsingStoredState(INDArray input, boolean training, boolean storeLastForTBPTT) { - return ffToLayerActivationsDetached(training, FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE, storeLastForTBPTT, layers.length-1, input, mask, null, false); + rnnClearPreviousState(); + clearLayerMaskArrays(); + } + + private INDArray[] getSubsetsForTbptt(int startTimeIdx, int endTimeIdx, INDArray input, + INDArray labels, + INDArray fMask, INDArray lMask) { + INDArray[] out = new INDArray[4]; + out[0] = input.get(NDArrayIndex.all(), NDArrayIndex.all(), + NDArrayIndex.interval(startTimeIdx, endTimeIdx)); + out[1] = labels.get(NDArrayIndex.all(), NDArrayIndex.all(), + NDArrayIndex.interval(startTimeIdx, endTimeIdx)); + + if (fMask != null) { + out[2] = fMask.get(NDArrayIndex.all(), + NDArrayIndex.interval(startTimeIdx, endTimeIdx)); + } + if (lMask != null) { + out[3] = lMask.get(NDArrayIndex.all(), + NDArrayIndex.interval(startTimeIdx, endTimeIdx)); } - /** Get the updater for this MultiLayerNetwork - * @return Updater for MultiLayerNetwork - */ - public Updater getUpdater() { - return getUpdater(true); + return out; + } + + /** + * Intended for internal/developer use + */ + public void updateRnnStateWithTBPTTState() { + for (int i = 0; i < layers.length; i++) { + if (layers[i] instanceof RecurrentLayer) { + RecurrentLayer l = ((RecurrentLayer) layers[i]); + l.rnnSetPreviousState(l.rnnGetTBPTTState()); + } else if (layers[i] instanceof MultiLayerNetwork) { + ((MultiLayerNetwork) layers[i]).updateRnnStateWithTBPTTState(); + } + } + } + + /** + * Get the {@link TrainingListener}s set for this network, if any + * + * @return listeners set for this network + */ + public Collection getListeners() { + return trainingListeners; + } + + @Override + public void setListeners(Collection listeners) { + if (layers == null) { + init(); + } + for (Layer layer : layers) { + layer.setListeners(listeners); } - public Updater getUpdater(boolean initializeIfReq) { - if (solver == null && initializeIfReq) { - synchronized(this){ - if(solver == null) { //May have been created while waiting for lock - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this).build(); - solver.getOptimizer().setUpdater(UpdaterCreator.getUpdater(this)); - } - } + if (solver != null) { + solver.setListeners(listeners); + } + + this.trainingListeners.clear(); + if (listeners != null) { + this.trainingListeners.addAll(listeners); + } + } + + @Override + public void setListeners(TrainingListener... listeners) { + Collection cListeners = new ArrayList<>(); + //Check: user might have done setListeners(null) thinking this would clear the current listeners. 
+ //This results in a TrainingListener[1] with a single null value -> results in an NPE later + if (listeners != null && listeners.length > 0) { + for (TrainingListener i : listeners) { + if (i != null) { + cListeners.add(i); } + } + } + setListeners(cListeners); + } + + /** + * @deprecated Use {@link #getListeners()} + */ + @Deprecated + public Collection getTrainingListeners() { + return trainingListeners; + } + + /** + * This method ADDS additional TrainingListener to existing listeners + * + * @param listeners Listeners to add + */ + @Override + public void addListeners(TrainingListener... listeners) { + Collections.addAll(trainingListeners, listeners); + + // fixme this is wrong, since it removes existing listeners from the solver + if (solver != null) { + solver.setListeners(this.trainingListeners); + } + } + + /** + * Usable only for classification networks in conjunction with OutputLayer. Cannot be used with + * RnnOutputLayer, CnnLossLayer, or networks used for regression.
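A minimal sketch of how the listener setters above are typically used (net is an assumed, already-initialized MultiLayerNetwork; ScoreIterationListener is the stock listener from org.deeplearning4j.optimize.listeners):

    net.setListeners(new ScoreIterationListener(10)); // replaces existing listeners; logs the score every 10 iterations
    net.addListeners(new ScoreIterationListener(1));  // ADDS a further listener without clearing the one set above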
To get the raw output + * activations of the output layer, use {@link #output(INDArray)} or similar.
+ *
+ * Equivalent to argmax(this.output(input)): Returns the predicted class indices corresponding to + * the predictions for each example in the features array. + * + * @param d The input features to perform inference on + * @return The predicted class index for each example + */ + @Override + public int[] predict(INDArray d) { + INDArray output = output(d, Layer.TrainingMode.TEST); + + if (d.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); } - /** Set the updater for the MultiLayerNetwork */ - public void setUpdater(Updater updater) { - if (solver == null) { - solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this).build(); - } - solver.getOptimizer().setUpdater(updater); + Preconditions.checkState(output.rank() == 2, + "predict(INDArray) method can only be used on rank 2 output - got array with rank %s", + output.rank()); + return output.argMax(1).toIntVector(); + } + + /** + * As per {@link #predict(INDArray)} but the returned values are looked up from the list of label + * names in the provided DataSet + */ + @Override + public List predict(org.nd4j.linalg.dataset.api.DataSet dataSet) { + Preconditions.checkState(dataSet.getLabelNamesList() != null, + "This method can only be used when the DataSet contains a label name list"); + int[] intRet = predict(dataSet.getFeatures()); + List ret = new ArrayList<>(); + for (int i = 0; i < intRet.length; i++) { + ret.add(i, dataSet.getLabelName(intRet[i])); + } + return ret; + } + + /** + * Fit the model for one iteration on the provided data + * + * @param data the examples to classify (one example in each row) + * @param labels the example labels(a binary outcome matrix) + */ + @Override + public void fit(INDArray data, INDArray labels) { + fit(data, labels, null, null); + } + + /** + * Fit the model for one iteration on the provided data + * + * @param features the examples to classify (one example in each row) + * @param labels the example labels(a binary outcome matrix) + * @param featuresMask The mask array for the features (used for variable length time series, + * etc). May be null. + * @param labelsMask The mask array for the labels (used for variable length time series, etc). + * May be null. + */ + public synchronized void fit(INDArray features, INDArray labels, INDArray featuresMask, + INDArray labelsMask) { + try { + fitHelper(features, labels, featuresMask, labelsMask); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + private void fitHelper(INDArray features, INDArray labels, INDArray featuresMask, + INDArray labelsMask) { + if (numParams() == 0) { + //No op: can't fit a network with 0 parameters + return; } - /**Set the mask arrays for features and labels. Mask arrays are typically used in situations such as one-to-many - * and many-to-one learning with recurrent neural networks, as well as for supporting time series of varying lengths - * within the same minibatch.
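As an illustration of the masked fit overload added above, a short sketch (net, features, labels and the two masks are assumed, with shapes [mb, nIn, T], [mb, nOut, T] and [mb, T]; mask entries are 1 for real time steps and 0 for padding):

    net.fit(features, labels, featuresMask, labelsMask); // one fit iteration on a variable-length minibatch
    net.fit(features, labels);                           // equivalent call when no masking is needed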
- * For example, with RNN data sets with input of shape [miniBatchSize,nIn,timeSeriesLength] and outputs of shape - * [miniBatchSize,nOut,timeSeriesLength], the features and mask arrays will have shape [miniBatchSize,timeSeriesLength] - * and contain values 0 or 1 at each element (to specify whether a given input/example is present - or merely padding - - * at a given time step).
- * NOTE: This method is not usually used directly. Instead, methods such as {@link #feedForward(INDArray, INDArray, INDArray)} - * and {@link #output(INDArray, boolean, INDArray, INDArray)} handle setting of masking internally. - * @param featuresMaskArray Mask array for features (input) - * @param labelsMaskArray Mask array for labels (output) - * @see #clearLayerMaskArrays() - */ - public void setLayerMaskArrays(INDArray featuresMaskArray, INDArray labelsMaskArray) { - if (featuresMaskArray != null) { + setInput(features); + setLabels(labels); + this.setLayerMaskArrays(featuresMask, labelsMask); + update(TaskUtils.buildTask(features, labels)); - if (featuresMaskArray.size(0) > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - //New approach: use feedForwardMaskArray method - feedForwardMaskArray(featuresMaskArray, MaskState.Active, (int) featuresMaskArray.size(0)); + LayerWorkspaceMgr workspaceMgr; + if (layerWiseConfigurations.getTrainingWorkspaceMode() == null) { + workspaceMgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + workspaceMgr = LayerWorkspaceMgr.builder() + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + //Note for updater working memory, we have the option to re-use WS_ALL_LAYERS_ACT or FF/BP_WORKING_MEM + // these should be closed by the time updaters are executed + //Generally, WS_ALL_LAYERS_ACT will be the larger of the two, so we'll use this + .with(ArrayType.UPDATER_WORKING_MEM, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .build(); + } + workspaceMgr.setHelperWorkspacePointers(helperWorkspaces); + + if (layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT) { + doTruncatedBPTT(features, labels, featuresMask, labelsMask, workspaceMgr); + } else { + if (solver == null) { + try (MemoryWorkspace wsO = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) + .build(); + } + } + //TODO CACHE WORKSPACE, IF USED??? + solver.optimize(workspaceMgr); + } + + clearLayerMaskArrays(); + clearLayersStates(); + synchronizeIterEpochCounts(); + } + + @Override + public void fit(INDArray data, LayerWorkspaceMgr workspaceMgr) { + throw new UnsupportedOperationException("Not supported: use pretrainLayer"); + } + + /** + * Fit the model for one iteration on the provided data + * + * @param data the data to train on + */ + @Override + public void fit(org.nd4j.linalg.dataset.api.DataSet data) { + fit(data.getFeatures(), data.getLabels(), data.getFeaturesMaskArray(), + data.getLabelsMaskArray()); + } + + /** + * Fit the model for one iteration on the provided data + * + * @param examples the examples to classify (one example in each row) + * @param labels the labels for each example (the number of labels must match + */ + @Override + public void fit(INDArray examples, int[] labels) { + org.deeplearning4j.nn.conf.layers.OutputLayer layerConf = + (org.deeplearning4j.nn.conf.layers.OutputLayer) getOutputLayer().conf().getLayer(); + + if (layerConf.getNOut() > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + fit(examples, FeatureUtil.toOutcomeMatrix(labels, (int) layerConf.getNOut())); + } + + /** + * Perform inference on the provided input/features - i.e., perform forward pass using the + * provided input/features and return the output of the final layer. + * + * @param input Input to the network + * @param train whether the output is test or train. 
This mainly affects hyper parameters such as + * dropout and batch normalization, which have different behaviour for test vs. + * train + * @return The network predictions - i.e., the activations of the final layer + */ + public INDArray output(INDArray input, TrainingMode train) { + return output(input, train == TrainingMode.TRAIN); + } + + /** + * Perform inference on the provided input/features - i.e., perform forward pass using the + * provided input/features and return the output of the final layer. + * + * @param input Input to the network + * @param train whether the output is test or train. This mainly affects hyper parameters such as + * dropout and batch normalization, which have different behaviour for test vs. + * train + * @return The network predictions - i.e., the activations of the final layer + */ + public INDArray output(INDArray input, boolean train) { + return output(input, train, null, null); + } + + /** + * Calculate the output of the network, with masking arrays. The masking arrays are used in + * situations such as one-to-many and many-to-one recurrent neural network (RNN) designs, as well + * as for supporting time series of varying lengths within the same minibatch. + */ + public INDArray output(INDArray input, boolean train, INDArray featuresMask, + INDArray labelsMask) { + return output(input, train, featuresMask, labelsMask, null); + } + + /** + * Get the network output, which is optionally placed in the specified memory workspace.
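A short sketch contrasting the two modes of the train flag described above (net and features are assumed):

    INDArray trainOut = net.output(features, true);  // training-mode pass: dropout/batch norm behave as during training
    INDArray testOut  = net.output(features, false); // inference-mode pass, same result as output(features)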
If no + * memory workspace is provided, the output will be detached (not in any workspace).
If a + * memory workspace is provided, the output activation array (i.e., the INDArray returned by this + * method) will be placed in the specified workspace. This workspace must be opened by the user + * before calling this method - and the user is responsible for (a) closing this workspace, and + * (b) ensuring the output array is not used out of scope (i.e., not used after closing the + * workspace to which it belongs - as this is likely to cause either an exception when used, or a + * crash). + * + * @param input Input to the network + * @param train True for train, false otherwise + * @param outputWorkspace May be null. If not null: the workspace MUST be opened before calling + * this method. + * @return The output/activations from the network (either detached or in the specified workspace + * if provided) + */ + public INDArray output(INDArray input, boolean train, MemoryWorkspace outputWorkspace) { + return output(input, train, null, null, outputWorkspace); + } + + /** + * Get the network output, which is optionally placed in the specified memory workspace.
If no + * memory workspace is provided, the output will be detached (not in any workspace).
If a + * memory workspace is provided, the output activation array (i.e., the INDArray returned by this + * method) will be placed in the specified workspace. This workspace must be opened by the user + * before calling this method - and the user is responsible for (a) closing this workspace, and + * (b) ensuring the output array is not used out of scope (i.e., not used after closing the + * workspace to which it belongs - as this is likely to cause either an exception when used, or a + * crash). + * + * @param input Input to the network + * @param train True for train, false otherwise + * @param outputWorkspace May be null. If not null: the workspace MUST be opened before calling + * this method. + * @return The output/activations from the network (either detached or in the specified workspace + * if provided) + */ + public synchronized INDArray output(INDArray input, boolean train, INDArray featuresMask, + INDArray labelsMask, MemoryWorkspace outputWorkspace) { + try { + return outputOfLayerDetached(train, FwdPassType.STANDARD, layers.length - 1, input, + featuresMask, labelsMask, outputWorkspace); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + /** + * This method uses provided OutputAdapter to return custom object built from INDArray + *
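A sketch of the workspace contract described above (the workspace id "OUTPUT_WS" and the variables net and features are assumptions for the example):

    try (MemoryWorkspace ws = Nd4j.getWorkspaceManager().getAndActivateWorkspace("OUTPUT_WS")) {
        INDArray out = net.output(features, false, ws); // out is placed in OUTPUT_WS, not detached
        // use out only inside this block; once the workspace is closed the array is no longer valid
    }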

+ * PLEASE NOTE: This method uses a dedicated Workspace for output generation to avoid redundant + * allocations + * + * @param inputs Input arrays to the network + * @param inputMasks Optional input mask arrays (may be null) + * @param labelMasks Optional label mask arrays (may be null) + * @param outputAdapter OutputAdapter instance + * @param T extends Object + * @return T instance produced by OutputAdapter + */ + public synchronized T output(@NonNull INDArray inputs, INDArray inputMasks, + INDArray labelMasks, @NonNull OutputAdapter outputAdapter) { + try (val ws = Nd4j.getWorkspaceManager() + .getAndActivateWorkspace(WS_ALL_LAYERS_ACT_CONFIG, WS_OUTPUT_MEM)) { + if (outputAdapter instanceof ModelAdapter) { + return ((ModelAdapter) outputAdapter).apply(this, new INDArray[]{inputs}, + new INDArray[]{inputMasks}, new INDArray[]{labelMasks}); + } else { + return outputAdapter.apply(output(inputs, false, inputMasks, labelMasks, ws)); + } + } + } + + /** + * Perform inference on the provided input/features - i.e., perform forward pass using the + * provided input/features and return the output of the final layer. Equivalent to + * {@link #output(INDArray, boolean)} with train=false - i.e., this method is used for inference. + * + * @param input Input to the network + * @return The network predictions - i.e., the activations of the final layer + */ + public INDArray output(INDArray input) { + return output(input, TrainingMode.TEST); + } + + /** + * Generate the output for all examples/batches in the input iterator, and concatenate them into a + * single array. See {@link #output(INDArray)}
NOTE 1: The output array can require a + * considerable amount of memory for iterators with a large number of examples
NOTE 2: This + * method cannot be used for variable length time series outputs, as this would require padding + * arrays for some outputs, or returning a mask array (which cannot be done with this method). For + * variable length time series applications, use one of the other output methods. This method also + * cannot be used with fully convolutional networks with different output sizes (for example, + * segmentation on different input image sizes). + * + * @param iterator Data to pass through the network + * @return output for all examples in the iterator, concatenated into a + */ + public INDArray output(DataSetIterator iterator, boolean train) { + List outList = new ArrayList<>(); + long[] firstOutputShape = null; + while (iterator.hasNext()) { + DataSet next = iterator.next(); + INDArray features = next.getFeatures(); + + if (features == null) { + continue; + } + + INDArray fMask = next.getFeaturesMaskArray(); + INDArray lMask = next.getLabelsMaskArray(); + INDArray output = this.output(features, train, fMask, lMask); + outList.add(output); + if (firstOutputShape == null) { + firstOutputShape = output.shape(); + } else { + //Validate that shapes are the same (may not be, for some RNN variable length time series applications) + long[] currShape = output.shape(); + Preconditions.checkState(firstOutputShape.length == currShape.length, + "Error during forward pass:" + + "different minibatches have different output array ranks - first minibatch shape %s, last minibatch shape %s", + firstOutputShape, currShape); + for (int i = 1; i < currShape.length; + i++) { //Skip checking minibatch dimension, fine if this varies + Preconditions.checkState(firstOutputShape[i] == currShape[i], + "Current output shape does not match first" + + " output array shape at position %s: all dimensions must match other than the first dimension.\n" + + + " For variable length output size/length use cases such as for RNNs with multiple sequence lengths," + + + " use one of the other (non iterator) output methods. First batch output shape: %s, current batch output shape: %s", + i, firstOutputShape, currShape); + } + } + } + return Nd4j.concat(0, outList.toArray(new INDArray[outList.size()])); + } + + /** + * Equivalent to {@link #output(DataSetIterator, boolean)} with train=false + */ + public INDArray output(DataSetIterator iterator) { + return output(iterator, false); + } + + /** + * Perform inference and then calculate the F1 score of the output(input) vs. the labels. + * + * @param input the input to perform inference with + * @param labels the true labels + * @return the score for the given input,label pairs + */ + @Override + public double f1Score(INDArray input, INDArray labels) { + feedForward(input); + setLabels(labels); + Evaluation eval = new Evaluation(); + eval.eval(labels, output(input)); + return eval.f1(); + } + + /** + * @deprecated Will be removed in a future release + */ + @Deprecated + @Override + public int numLabels() { + return (int) labels.size(1); + } + + /** + * Sets the input and labels and calculates the score (value of the output layer loss function + * plus l1/l2 if applicable) for the prediction with respect to the true labels
This is + * equivalent to {@link #score(DataSet, boolean)} with training==false. + * + * @param data the data to score + * @return the score for the given input,label pairs + * @see #score(DataSet, boolean) + */ + public double score(DataSet data) { + return score(data, false); + } + + /** + * Sets the input and labels and calculates the score (value of the output layer loss function + * plus l1/l2 if applicable) for the prediction with respect to the true labels
+ * + * @param data data to calculate score for + * @param training If true: score during training. If false: score at test time. This can affect + * the application of certain features, such as dropout and dropconnect (which are + * applied at training time only) + * @return the score (value of the loss function) + */ + public double score(DataSet data, boolean training) { + try { + return scoreHelper(data, training); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + private double scoreHelper(DataSet data, boolean training) { + boolean hasMaskArray = data.hasMaskArrays(); + if (hasMaskArray) { + setLayerMaskArrays(data.getFeaturesMaskArray(), data.getLabelsMaskArray()); + } + + if (!(getOutputLayer() instanceof IOutputLayer)) { + throw new IllegalStateException( + "Cannot calculate score if final layer is not an instance of IOutputLayer. " + + "Final layer is of type: " + getOutputLayer().getClass()); + } + + WorkspaceMode wsm = (training ? layerWiseConfigurations.getTrainingWorkspaceMode() + : layerWiseConfigurations.getInferenceWorkspaceMode()); + LayerWorkspaceMgr mgr; + if (wsm == WorkspaceMode.NONE) { + mgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + mgr = LayerWorkspaceMgr.builder() + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + //TODO we can probably optimize this + .noWorkspaceFor(ArrayType.ACTIVATIONS) + .noWorkspaceFor(ArrayType.INPUT) + .build(); + } + mgr.setHelperWorkspacePointers(helperWorkspaces); + + INDArray inputToOutputLayer = outputOfLayerDetached(training, FwdPassType.STANDARD, + layers.length - 2, data.getFeatures(), + data.getFeaturesMaskArray(), data.getLabelsMaskArray(), null); + + if (data.getFeatures().size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + IOutputLayer ol = (IOutputLayer) getOutputLayer(); + if (getLayerWiseConfigurations().getInputPreProcess(layers.length - 1) != null) { + inputToOutputLayer = getLayerWiseConfigurations().getInputPreProcess(layers.length - 1) + .preProcess(inputToOutputLayer, (int) data.getFeatures().size(0), mgr); + } + ol.setInput(inputToOutputLayer, mgr); //Feedforward doesn't include output layer for efficiency + ol.setLabels(data.getLabels()); + double score; + try (MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { + score = ol.computeScore(calcRegularizationScore(true), training, mgr); + } + + if (hasMaskArray) { + clearLayerMaskArrays(); + } + clearLayersStates(); + + return score; + } + + /** + * As per {@link #scoreExamples(DataSet, boolean)} - the outputs (example scores) for all DataSets + * in the iterator are concatenated + */ + public INDArray scoreExamples(DataSetIterator iter, boolean addRegularizationTerms) { + List out = new ArrayList<>(); + + while (iter.hasNext()) { + out.add(scoreExamples(iter.next(), addRegularizationTerms)); + } + return Nd4j.toFlattened('f', out); + } + + /** + * Calculate the score for each example in a DataSet individually. Unlike {@link #score(DataSet)} + * and {@link #score(DataSet, boolean)} this method does not average/sum over examples. This + * method allows for examples to be scored individually (at test time only), which may be useful + * for example for autoencoder architectures and the like.
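For illustration, a minimal per-example scoring sketch (net and a DataSet named data are assumed):

    INDArray exampleScores = net.scoreExamples(data, true); // column vector: one loss value per example, including l1/l2 terms
    double worst = exampleScores.maxNumber().doubleValue(); // e.g., locate the hardest example in the minibatch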
Each row of the output (assuming + * addRegularizationTerms == true) is equivalent to calling score(DataSet) with a single example. + * + * @param data The data to score + * @param addRegularizationTerms If true: add l1/l2 regularization terms (if any) to the score. If + * false: don't add regularization terms + * @return An INDArray (column vector) of size input.numRows(); the ith entry is the score (loss + * value) of the ith example + */ + public INDArray scoreExamples(DataSet data, boolean addRegularizationTerms) { + try { + return scoreExamplesHelper(data, addRegularizationTerms); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + private INDArray scoreExamplesHelper(DataSet data, boolean addRegularizationTerms) { + INDArray inputLast = outputOfLayerDetached(false, FwdPassType.STANDARD, layers.length - 2, + data.getFeatures(), + data.getFeaturesMaskArray(), data.getLabelsMaskArray(), null); + setLabels(data.getLabels()); + setLayerMaskArrays(data.getFeaturesMaskArray(), data.getLabelsMaskArray()); + + //TODO we might want workspaces here? + LayerWorkspaceMgr mgr = LayerWorkspaceMgr.noWorkspaces(); + + INDArray out; + if (getOutputLayer() instanceof IOutputLayer) { + IOutputLayer ol = (IOutputLayer) getOutputLayer(); + if (layerWiseConfigurations.getInputPreProcess(layers.length - 1) != null) { + + if (data.getFeatures().size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + inputLast = layerWiseConfigurations.getInputPreProcess(layers.length - 1) + .preProcess(inputLast, + (int) data.getFeatures().size(0), mgr); + } + ol.setLabels(data.getLabels()); + ol.setInput(inputLast, mgr); + double r = (addRegularizationTerms ? calcRegularizationScore(true) : 0); + out = ol.computeScoreForExamples(r, mgr); + } else { + throw new UnsupportedOperationException( + "Cannot calculate score with respect to labels without an OutputLayer"); + } + + clearLayersStates(); + clearLayerMaskArrays(); + return out; + } + + @Override + public void fit() { + fit(input, labels); + } + + @Override + public void update(INDArray gradient, String paramType) { + throw new UnsupportedOperationException("Not implemented"); + } + + /** + * Score of the model (relative to the objective function) - previously calculated on the last + * minibatch + * + * @return the score of the model (relative to the objective function) + */ + @Override + public double score() { + return score; + } + + /** + * Intended for developer/internal use + */ + public void setScore(double score) { + this.score = score; + } + + @Override + public void computeGradientAndScore(LayerWorkspaceMgr layerWorkspaceMgr) { + computeGradientAndScore(); + } + + public void computeGradientAndScore() { + + if (!(getOutputLayer() instanceof IOutputLayer)) { + throw new DL4JException( + "Cannot calculate gradient and score with respect to labels: final layer is not an IOutputLayer. " + + + "Final layer class: " + getOutputLayer().getClass() + + ". To calculate gradients and fit a network " + + "using backpropagation, the final layer must be an output layer"); + } + + //Note: Workspace manager is only ose here for score calculation... 
other workspace managers are used in the + // various FF/backprop methds + LayerWorkspaceMgr mgr; + if (layerWiseConfigurations.getTrainingWorkspaceMode() == WorkspaceMode.NONE) { + mgr = LayerWorkspaceMgr.noWorkspaces(); + } else { + mgr = LayerWorkspaceMgr.builder() + .with(ArrayType.INPUT, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.ACTIVATIONS, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG) + .with(ArrayType.FF_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.BP_WORKING_MEM, WS_LAYER_WORKING_MEM, WS_LAYER_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_FF_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .with(ArrayType.RNN_BP_LOOP_WORKING_MEM, WS_RNN_LOOP_WORKING_MEM, + WS_RNN_LOOP_WORKING_MEM_CONFIG) + .build(); + + if (layerWiseConfigurations.getCacheMode() != null) { + //For now: store cache mode activations in activations workspace + mgr.setWorkspace(ArrayType.FF_CACHE, WS_ALL_LAYERS_ACT, WS_ALL_LAYERS_ACT_CONFIG); + } + } + + boolean tbptt = layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT; + FwdPassType fwdType = (tbptt ? FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE + : FwdPassType.STANDARD); + synchronizeIterEpochCounts(); + + //Calculate activations (which are stored in each layer, and used in backprop) + try (MemoryWorkspace ws = mgr.notifyScopeEntered(ArrayType.ACTIVATIONS)) { + //First: do a feed-forward through the network + //Note that we don't actually need to do the full forward pass through the output layer right now; but we do + // need the input to the output layer to be set (such that backprop can be done) + List activations = ffToLayerActivationsInWs(layers.length - 2, fwdType, tbptt, + input, mask, null); + if (!trainingListeners.isEmpty()) { + //TODO: We possibly do want output layer activations in some cases here... + for (TrainingListener tl : trainingListeners) { + tl.onForwardPass(this, activations); + } + } + INDArray inputToOutputLayer = activations.get(activations.size() - 1); + if (layerWiseConfigurations.getInputPreProcess(layers.length - 1) != null) { + inputToOutputLayer = layerWiseConfigurations.getInputPreProcess(layers.length - 1) + .preProcess(inputToOutputLayer, getInputMiniBatchSize(), mgr); + //Validate activations location + } + getOutputLayer().setInput(inputToOutputLayer, mgr); + //Then: compute gradients + Pair pair = calcBackpropGradients(null, true, false, false); + this.gradient = (pair == null ? null : pair.getFirst()); + + //Calculate score + try (MemoryWorkspace wsFF = mgr.notifyScopeEntered(ArrayType.FF_WORKING_MEM)) { + double r = calcRegularizationScore(true); + score = ((IOutputLayer) getOutputLayer()).computeScore(r, true, mgr); + } + + //Listeners + if (!trainingListeners.isEmpty()) { + try (MemoryWorkspace workspace = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + for (TrainingListener tl : trainingListeners) { + tl.onBackwardPass(this); + } + } + } + } + + //Clear the post noise/dropconnect parameters on the output layer + getOutputLayer().clearNoiseWeightParams(); + } + + /** + * Clear the inputs. Clears optimizer state. 
+ */ + public void clear() { + for (Layer layer : layers) { + layer.clear(); + } + + input = null; + labels = null; + solver = null; + } + + @Override + public void applyConstraints(int iteration, int epoch) { + for (Layer l : layers) { + l.applyConstraints(iteration, epoch); + } + } + + @Override + public void setInput(INDArray input, LayerWorkspaceMgr mgr) { + throw new UnsupportedOperationException("Not supported"); + } + + /** + * Get the output layer - i.e., the last layer in the netwok + * + * @return + */ + public Layer getOutputLayer() { + Layer ret = getLayers()[getLayers().length - 1]; + if (ret instanceof FrozenLayerWithBackprop) { + ret = ((FrozenLayerWithBackprop) ret).getInsideLayer(); + } + return ret; + } + + + /** + * See {@link #setParams(INDArray)} + */ + public void setParameters(INDArray params) { + setParams(params); + } + + /** + * Intended for internal/developer use + */ + public NeuralNetConfiguration getDefaultConfiguration() { + return defaultConfiguration; + } + + public INDArray getLabels() { + return labels; + } + + /** + * @param labels Labels to set + */ + public void setLabels(INDArray labels) { + this.labels = labels; + } + + public INDArray getInput() { + return input; + } + + /** + * Set the input array for the network + * + * @param input Input array to set + */ + public void setInput(INDArray input) { + this.input = input; + if (this.layers == null) { + init(); + } + if (input != null) { + if (input.length() == 0) { + throw new IllegalArgumentException( + "Invalid input: length 0 (shape: " + Arrays.toString(input.shape()) + ")"); + } + + if (input.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + setInputMiniBatchSize((int) input.size(0)); + } + } + + /** + * Get the number of layers in the network + * + * @return the number of layers in the network + */ + public int getnLayers() { + return layerWiseConfigurations.getConfs().size(); + } + + /** + * @return The layers in the network + */ + public synchronized Layer[] getLayers() { + return layers; + } + + public void setLayers(Layer[] layers) { + this.layers = layers; + } + + public Layer getLayer(int i) { + Preconditions.checkArgument(i >= 0 && i < layers.length, + "Invalid layer index: layer index must be 0" + + " to %s (inclusive), got index %s", layers.length - 1, i); + return layers[i]; + } + + public Layer getLayer(String name) { + return layerMap.get(name); + } + + public List getLayerNames() { + return new ArrayList<>(layerMap.keySet()); + } + + public INDArray getMask() { + return mask; + } + + public void setMask(INDArray mask) { + this.mask = mask; + } + + public INDArray getMaskArray() { + return mask; + } + + @Override + public void setMaskArray(INDArray maskArray) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isPretrainLayer() { + return false; + } + + @Override + public void clearNoiseWeightParams() { + for (Layer l : layers) { + l.clearNoiseWeightParams(); + } + } + + @Override + public void allowInputModification(boolean allow) { + throw new UnsupportedOperationException("Not supported"); + } + + //========== + //Layer methods + + @Override + public Pair feedForwardMaskArray(INDArray maskArray, + MaskState currentMaskState, + int minibatchSize) { + if (maskArray == null) { + for (int i = 0; i < layers.length; i++) { + layers[i].feedForwardMaskArray(null, null, minibatchSize); + } + } else { + //Do a forward pass through each preprocessor and layer + for (int i = 0; i < layers.length; i++) { + InputPreProcessor preProcessor = 
getLayerWiseConfigurations().getInputPreProcess(i); + + if (preProcessor != null) { + Pair p = + preProcessor.feedForwardMaskArray(maskArray, currentMaskState, minibatchSize); + if (p != null) { + maskArray = p.getFirst(); + currentMaskState = p.getSecond(); + } else { + maskArray = null; + currentMaskState = null; + } + } + + Pair p = + layers[i].feedForwardMaskArray(maskArray, currentMaskState, minibatchSize); + if (p != null) { + maskArray = p.getFirst(); + currentMaskState = p.getSecond(); + } else { + maskArray = null; + currentMaskState = null; + } + } + } + + return new Pair<>(maskArray, currentMaskState); + } + + @Override + public LayerHelper getHelper() { + throw new UnsupportedOperationException("Not supported"); + } + + @Override + public Type type() { + return Type.MULTILAYER; + } + + /** + * Equivalent to {@link #output(INDArray)} using the input set via {@link #setInput(INDArray)} + */ + public INDArray activate(TrainingMode training) { + return output(input, training == TrainingMode.TRAIN); + } + + /** + * Equivalent to {@link #output(INDArray, TrainingMode)} + */ + public INDArray activate(INDArray input, TrainingMode training) { + return output(input, training == TrainingMode.TRAIN); + } + + @Override + public Pair backpropGradient(INDArray epsilon, + LayerWorkspaceMgr workspaceMgr) { + if (getOutputLayer() instanceof IOutputLayer) { + throw new UnsupportedOperationException( + "Cannot calculate gradients based on epsilon with OutputLayer"); + } + + return calcBackpropGradients(epsilon, false, false, true); + } + + @Override + public int getIndex() { + return layerIndex; + } + + @Override + public void setIndex(int index) { + layerIndex = index; + } + + @Override + public int getIterationCount() { + return getLayerWiseConfigurations().getIterationCount(); + } + + @Override + public void setIterationCount(int iterationCount) { + getLayerWiseConfigurations().setIterationCount(iterationCount); + } + + @Override + public int getEpochCount() { + return getLayerWiseConfigurations().getEpochCount(); + } + + @Override + public void setEpochCount(int epochCount) { + getLayerWiseConfigurations().setEpochCount(epochCount); + } + + @Override + public double calcRegularizationScore(boolean backpropParamsOnly) { + double scoreSum = 0.0; + for (int i = 0; i < layers.length; i++) { + scoreSum += layers[i].calcRegularizationScore(backpropParamsOnly); + } + return scoreSum; + } + + @Override + public void update(Gradient gradient) { + if (gradient.gradient().length() != numParams(true)) { + throw new IllegalArgumentException( + "Invalid input: expect gradients array of length " + numParams(true)); + } + for (Map.Entry entry : gradient.gradientForVariable().entrySet()) { + String key = entry.getKey(); + INDArray val = entry.getValue(); + int idx = key.indexOf('_'); + if (idx == -1) { + throw new IllegalStateException( + "Invalid param key: not have layer separator: \"" + key + "\""); + } + Integer layerId = Integer.parseInt(key.substring(0, idx)); + String paramType = key.substring(idx + 1); + // Update MLN gradient + this.gradient.gradientForVariable().put(key, val); + // Update layer params + layers[layerId].update(val, paramType); + } + // Update layerwise gradient view + setBackpropGradientsViewArray(gradient.gradient()); + + } + + @Override + public INDArray activate(boolean training, LayerWorkspaceMgr mgr) { + throw new UnsupportedOperationException(); + } + + @Override + public INDArray activate(INDArray input, boolean training, LayerWorkspaceMgr mgr) { + throw new 
UnsupportedOperationException(); + } + + @Override + public int getInputMiniBatchSize() { + if (!conf().isMiniBatch()) { + return 1; + } + + if (input.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + return (int) input.size(0); + } + + @Override + public void setInputMiniBatchSize(int size) { + if (layers != null) { + for (Layer l : layers) { + l.setInputMiniBatchSize(size); + } + } + } + + /** + * If this MultiLayerNetwork contains one or more RNN layers: conduct forward pass (prediction) + * but using previous stored state for any RNN layers. The activations for the final step are also + * stored in the RNN layers for use next time rnnTimeStep() is called.
This method can be used + * to generate output one or more steps at a time instead of always having to do a forward pass from + * t=0. Example uses are for streaming data, and for generating samples from network output one + * step at a time (where samples are then fed back into the network as input).
If no previous + * state is present in RNN layers (i.e., initially or after calling rnnClearPreviousState()), the + * default initialization (usually 0) is used.
Supports mini-batch (i.e., multiple + * predictions/forward pass in parallel) as well as for single examples.
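+ * A minimal usage sketch (illustrative only; {@code net}, {@code firstStepInput} and the
+ * {@code sampleNextInput} helper are assumed here and are not part of this class):
+ * <pre>{@code
+ * net.rnnClearPreviousState();
+ * INDArray step = firstStepInput;              // shape [miniBatchSize, nIn, 1]
+ * for (int t = 0; t < 100; t++) {              // generate 100 further steps
+ *     INDArray out = net.rnnTimeStep(step);    // uses and updates the stored RNN state
+ *     step = sampleNextInput(out);             // hypothetical helper: sample the next input from the output
+ * }
+ * }</pre>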
+ * + * @param input Input to network. May be for one or multiple time steps. For single time step: + * input has shape [miniBatchSize,inputSize] or [miniBatchSize,inputSize,1]. + * miniBatchSize=1 for single example.
For multiple time steps: + * [miniBatchSize,inputSize,inputTimeSeriesLength] + * @return Output activations. If output is RNN layer (such as RnnOutputLayer): if input has shape + * [miniBatchSize,inputSize] i.e., is 2d, output has shape [miniBatchSize,outputSize] (i.e., also + * 2d).
Otherwise output is 3d [miniBatchSize,outputSize,inputTimeSeriesLength] when using + * RnnOutputLayer. + * @see #rnnTimeStep(INDArray, MemoryWorkspace) For outputting the activations in the specified + * workspace + */ + public INDArray rnnTimeStep(INDArray input) { + return rnnTimeStep(input, null); + } + + /** + * See {@link #rnnTimeStep(INDArray)} for details
If no memory workspace is provided, the + * output will be detached (not in any workspace).
If a memory workspace is provided, the + * output activation array (i.e., the INDArray returned by this method) will be placed in the + * specified workspace. This workspace must be opened by the user before calling this method - and + * the user is responsible for (a) closing this workspace, and (b) ensuring the output array is + * not used out of scope (i.e., not used after closing the workspace to which it belongs - as this + * is likely to cause either an exception when used, or a crash). + * + * @param input Input activations + * @param outputWorkspace Output workspace. May be null + * @return The output/activations from the network (either detached or in the specified workspace + * if provided) + */ + public INDArray rnnTimeStep(INDArray input, MemoryWorkspace outputWorkspace) { + try { + boolean inputIs2d = input.rank() == 2; + INDArray out = outputOfLayerDetached(false, FwdPassType.RNN_TIMESTEP, layers.length - 1, + input, null, null, outputWorkspace); + if (inputIs2d && out.rank() == 3 && layers[layers.length - 1].type() == Type.RECURRENT) { + //Return 2d output with shape [miniBatchSize,nOut] + // instead of 3d output with shape [miniBatchSize,nOut,1] + return out.tensorAlongDimension(0, 1, 0); + } + return out; + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + /** + * Get the state of the RNN layer, as used in rnnTimeStep(). + * + * @param layer Number/index of the layer. + * @return Hidden state, or null if layer is not an RNN layer + */ + public Map rnnGetPreviousState(int layer) { + if (layer < 0 || layer >= layers.length) { + throw new IllegalArgumentException("Invalid layer number"); + } + Layer l = layers[layer]; + if (l instanceof org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer) { + l = ((org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer) l).getUnderlying(); + } + if (!(l instanceof RecurrentLayer)) { + throw new IllegalArgumentException("Layer is not an RNN layer"); + } + return ((RecurrentLayer) l).rnnGetPreviousState(); + } + + /** + * Set the state of the RNN layer. + * + * @param layer The number/index of the layer. + * @param state The state to set the specified layer to + */ + public void rnnSetPreviousState(int layer, Map state) { + if (layer < 0 || layer >= layers.length) { + throw new IllegalArgumentException("Invalid layer number"); + } + Layer l = layers[layer]; + if (l instanceof org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer) { + l = ((org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer) l).getUnderlying(); + } + if (!(l instanceof RecurrentLayer)) { + throw new IllegalArgumentException("Layer is not an RNN layer"); + } + RecurrentLayer r = (RecurrentLayer) l; + r.rnnSetPreviousState(state); + } + + /** + * Clear the previous state of the RNN layers (if any). + */ + public void rnnClearPreviousState() { + if (layers == null) { + return; + } + for (int i = 0; i < layers.length; i++) { + if (layers[i] instanceof RecurrentLayer) { + ((RecurrentLayer) layers[i]).rnnClearPreviousState(); + } else if (layers[i] instanceof MultiLayerNetwork) { + ((MultiLayerNetwork) layers[i]).rnnClearPreviousState(); + } else if (layers[i] instanceof BaseWrapperLayer + && ((BaseWrapperLayer) layers[i]).getUnderlying() instanceof RecurrentLayer) { + ((RecurrentLayer) ((BaseWrapperLayer) layers[i]).getUnderlying()).rnnClearPreviousState(); + } + } + } + + /** + * Similar to rnnTimeStep and feedForward() methods. Difference here is that this method:
(a) + * like rnnTimeStep, does a forward pass using stored state for RNN layers, and
(b) unlike + * rnnTimeStep, does not modify the RNN layer state.
Therefore multiple calls to this method + * with the same input should have the same output.
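+ * A minimal sketch of such a call ({@code net} and {@code features} are assumed to exist; the
+ * last flag is set as it would be for truncated BPTT training):
+ * <pre>{@code
+ * List<INDArray> acts = net.rnnActivateUsingStoredState(features, true, true);
+ * INDArray outputLayerActivations = acts.get(acts.size() - 1);   // index 0 holds the input itself
+ * }</pre>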
Typically used during training only. Use + * rnnTimeStep for prediction/forward pass at test time. + * + * @param input Input to network + * @param training Whether training or not + * @param storeLastForTBPTT set to true if used as part of truncated BPTT training + * @return Activations for each layer (including input, as per feedforward() etc) + */ + public List rnnActivateUsingStoredState(INDArray input, boolean training, + boolean storeLastForTBPTT) { + return ffToLayerActivationsDetached(training, FwdPassType.RNN_ACTIVATE_WITH_STORED_STATE, + storeLastForTBPTT, layers.length - 1, input, mask, null, false); + } + + /** + * Get the updater for this MultiLayerNetwork + * + * @return Updater for MultiLayerNetwork + */ + public Updater getUpdater() { + return getUpdater(true); + } + + /** + * Set the updater for the MultiLayerNetwork + */ + public void setUpdater(Updater updater) { + if (solver == null) { + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this).build(); + } + solver.getOptimizer().setUpdater(updater); + } + + public Updater getUpdater(boolean initializeIfReq) { + if (solver == null && initializeIfReq) { + synchronized (this) { + if (solver == null) { //May have been created while waiting for lock + solver = new Solver.Builder().configure(conf()).listeners(getListeners()).model(this) + .build(); + solver.getOptimizer().setUpdater(UpdaterCreator.getUpdater(this)); + } + } + } + if (solver != null) { + return solver.getOptimizer().getUpdater(initializeIfReq); + } + return null; + } + + /** + * Set the mask arrays for features and labels. Mask arrays are typically used in situations such + * as one-to-many and many-to-one learning with recurrent neural networks, as well as for + * supporting time series of varying lengths within the same minibatch.
For example, with RNN + * data sets with input of shape [miniBatchSize,nIn,timeSeriesLength] and outputs of shape + * [miniBatchSize,nOut,timeSeriesLength], the features and labels mask arrays will both have shape + * [miniBatchSize,timeSeriesLength] and contain values 0 or 1 at each element (to specify whether + * a given input/example is present - or merely padding - at a given time step).
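+ * A small sketch of building such a mask by hand (the sizes and the {@code net} variable are
+ * illustrative assumptions only):
+ * <pre>{@code
+ * // Three sequences of lengths 5, 3 and 2, padded to a common length of 5
+ * int[] lengths = {5, 3, 2};
+ * INDArray featuresMask = Nd4j.zeros(3, 5);
+ * for (int i = 0; i < lengths.length; i++) {
+ *     for (int t = 0; t < lengths[i]; t++) {
+ *         featuresMask.putScalar(i, t, 1.0);
+ *     }
+ * }
+ * net.setLayerMaskArrays(featuresMask, featuresMask);   // here the same mask is reused for the labels
+ * }</pre>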
+ * NOTE: This method is not usually used directly. Instead, methods such as + * {@link #feedForward(INDArray, INDArray, INDArray)} + * and {@link #output(INDArray, boolean, INDArray, INDArray)} handle setting of masking + * internally. + * + * @param featuresMaskArray Mask array for features (input) + * @param labelsMaskArray Mask array for labels (output) + * @see #clearLayerMaskArrays() + */ + public void setLayerMaskArrays(INDArray featuresMaskArray, INDArray labelsMaskArray) { + if (featuresMaskArray != null) { + + if (featuresMaskArray.size(0) > Integer.MAX_VALUE) { + throw new ND4JArraySizeException(); + } + //New approach: use feedForwardMaskArray method + feedForwardMaskArray(featuresMaskArray, MaskState.Active, (int) featuresMaskArray.size(0)); /* @@ -3308,837 +3629,883 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, org.d // non-zero (i.e., activationFunction(0*weights + bias) != 0 in general) //This assumes that the time series input is masked - i.e., values are 0 at the padded time steps, // so we don't need to do anything for the recurrent layer - + //Now, if mask array is 2d -> need to reshape to 1d (column vector) in the exact same order // as is done for 3d -> 2d time series reshaping INDArray reshapedFeaturesMask = TimeSeriesUtils.reshapeTimeSeriesMaskToVector(featuresMaskArray); - + for( int i=0; i See {@link #setLayerMaskArrays(INDArray, INDArray)} + * for details on mask arrays. + */ + public void clearLayerMaskArrays() { + for (Layer layer : layers) { + layer.setMaskArray(null); + } + } + + /** + * Evaluate the network (classification performance) + * + * @param iterator Iterator to evaluate on + * @return Evaluation object; results of evaluation on all examples in the data set + */ + public T evaluate(@NonNull DataSetIterator iterator) { + return (T) evaluate(iterator, null); + } + + /** + * Evaluate the network (classification performance). Can only be used with MultiDataSetIterator + * instances with a single input/output array + * + * @param iterator Iterator to evaluate on + * @return Evaluation object; results of evaluation on all examples in the data set + */ + public Evaluation evaluate(@NonNull MultiDataSetIterator iterator) { + return evaluate(new MultiDataSetWrapperIterator(iterator)); + } + + /** + * Evaluate the network for regression performance + * + * @param iterator Data to evaluate on + * @return Regression evaluation + */ + public T evaluateRegression(DataSetIterator iterator) { + return (T) doEvaluation(iterator, new RegressionEvaluation(iterator.totalOutcomes()))[0]; + } + + /** + * Evaluate the network for regression performance Can only be used with MultiDataSetIterator + * instances with a single input/output array + * + * @param iterator Data to evaluate on + */ + public org.nd4j.evaluation.regression.RegressionEvaluation evaluateRegression( + MultiDataSetIterator iterator) { + return evaluateRegression(new MultiDataSetWrapperIterator(iterator)); + } + + /** + * @deprecated To be removed - use {@link #evaluateROC(DataSetIterator, int)} to enforce selection + * of appropriate ROC/threshold configuration + */ + @Deprecated + public T evaluateROC(DataSetIterator iterator) { + return evaluateROC(iterator, 0); + } + + /** + * Evaluate the network (must be a binary classifier) on the specified data, using the {@link ROC} + * class + * + * @param iterator Data to evaluate on + * @param rocThresholdSteps Number of threshold steps to use with {@link ROC} - see that class for + * details. 
+ * @return ROC evaluation on the given dataset + */ + public T evaluateROC(DataSetIterator iterator, int rocThresholdSteps) { + Layer outputLayer = getOutputLayer(); + if (getLayerWiseConfigurations().isValidateOutputLayerConfig()) { + OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), + ROC.class); + } + return (T) doEvaluation(iterator, new org.deeplearning4j.eval.ROC(rocThresholdSteps))[0]; + } + + /** + * @deprecated To be removed - use {@link #evaluateROCMultiClass(DataSetIterator, int)} to enforce + * selection of appropriate ROC/threshold configuration + */ + @Deprecated + public T evaluateROCMultiClass(DataSetIterator iterator) { + return evaluateROCMultiClass(iterator, 0); + } + + /** + * Evaluate the network on the specified data, using the {@link ROCMultiClass} class + * + * @param iterator Data to evaluate on + * @param rocThresholdSteps Number of threshold steps to use with {@link ROCMultiClass} + * @return Multi-class ROC evaluation on the given dataset + */ + public T evaluateROCMultiClass(DataSetIterator iterator, + int rocThresholdSteps) { + Layer outputLayer = getOutputLayer(); + if (getLayerWiseConfigurations().isValidateOutputLayerConfig()) { + OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), + ROCMultiClass.class); + } + return (T) doEvaluation(iterator, + new org.deeplearning4j.eval.ROCMultiClass(rocThresholdSteps))[0]; + } + + /** + * Perform evaluation using an arbitrary IEvaluation instance. + * + * @param iterator data to evaluate on + */ + public T[] doEvaluation(DataSetIterator iterator, T... evaluations) { + try { + return doEvaluationHelper(iterator, evaluations); + } catch (OutOfMemoryError e) { + CrashReportingUtil.writeMemoryCrashDump(this, e); + throw e; + } + } + + public T[] doEvaluationHelper(DataSetIterator iterator, + T... evaluations) { + if (!iterator.hasNext() && iterator.resetSupported()) { + iterator.reset(); } - /** Remove the mask arrays from all layers.
- * See {@link #setLayerMaskArrays(INDArray, INDArray)} for details on mask arrays. - */ - public void clearLayerMaskArrays() { - for (Layer layer : layers) { - layer.setMaskArray(null); - } + DataSetIterator iter = + iterator.asyncSupported() ? new AsyncDataSetIterator(iterator, 2, true) : iterator; + + WorkspaceMode cMode = layerWiseConfigurations.getTrainingWorkspaceMode(); + layerWiseConfigurations.setTrainingWorkspaceMode( + layerWiseConfigurations.getInferenceWorkspaceMode()); + + //First: let's determine if we should do 'split feed forward' for long time series + //The idea: RNN 20k time steps. Train using TBPTT length 100 -> 200 segments of length 100. If we naively + // just use .output(INDArray) here, then our memory requirements are 200x larger than if we did the same + // evaluation in segments... + //Only do this if TBPTT is enabled - if not, it means we can train without TBPTT and hence should be able + // to test without splitting also + boolean useRnnSegments = (layerWiseConfigurations.getBackpropType() + == BackpropType.TruncatedBPTT); + + MemoryWorkspace outputWs; + if (getLayerWiseConfigurations().getInferenceWorkspaceMode() == WorkspaceMode.ENABLED) { + outputWs = Nd4j.getWorkspaceManager() + .getWorkspaceForCurrentThread(WS_ALL_LAYERS_ACT_CONFIG, WS_OUTPUT_MEM); + } else { + outputWs = new DummyWorkspace(); } - /** - * Evaluate the network (classification performance) - * - * @param iterator Iterator to evaluate on - * @return Evaluation object; results of evaluation on all examples in the data set - */ - public T evaluate(@NonNull DataSetIterator iterator) { - return (T)evaluate(iterator, null); - } + while (iter.hasNext()) { + DataSet next = iter.next(); - /** - * Evaluate the network (classification performance). - * Can only be used with MultiDataSetIterator instances with a single input/output array - * - * @param iterator Iterator to evaluate on - * @return Evaluation object; results of evaluation on all examples in the data set - */ - public Evaluation evaluate(@NonNull MultiDataSetIterator iterator) { - return evaluate(new MultiDataSetWrapperIterator(iterator)); - } + if (next.getFeatures() == null || next.getLabels() == null) { + continue; + } - /** - * Evaluate the network for regression performance - * @param iterator Data to evaluate on - * @return Regression evaluation - */ - public T evaluateRegression(DataSetIterator iterator) { - return (T)doEvaluation(iterator, new RegressionEvaluation(iterator.totalOutcomes()))[0]; - } + INDArray features = next.getFeatures(); + INDArray labels = next.getLabels(); + INDArray fMask = next.getFeaturesMaskArray(); + INDArray lMask = next.getLabelsMaskArray(); + List meta = next.getExampleMetaData(); - /** - * Evaluate the network for regression performance - * Can only be used with MultiDataSetIterator instances with a single input/output array - * @param iterator Data to evaluate on - */ - public org.nd4j.evaluation.regression.RegressionEvaluation evaluateRegression(MultiDataSetIterator iterator) { - return evaluateRegression(new MultiDataSetWrapperIterator(iterator)); - } + if (!useRnnSegments) { + //Standard/non-RNN case: + try (MemoryWorkspace ws = outputWs.notifyScopeEntered()) { + INDArray out = outputOfLayerDetached(false, FwdPassType.STANDARD, layers.length - 1, + features, fMask, lMask, ws); - /** - * @deprecated To be removed - use {@link #evaluateROC(DataSetIterator, int)} to enforce selection of appropriate ROC/threshold configuration - */ - @Deprecated - public T evaluateROC(DataSetIterator iterator){ - 
return evaluateROC(iterator, 0); - } - - /** - * Evaluate the network (must be a binary classifier) on the specified data, using the {@link ROC} class - * - * @param iterator Data to evaluate on - * @param rocThresholdSteps Number of threshold steps to use with {@link ROC} - see that class for details. - * @return ROC evaluation on the given dataset - */ - public T evaluateROC(DataSetIterator iterator, int rocThresholdSteps) { - Layer outputLayer = getOutputLayer(); - if(getLayerWiseConfigurations().isValidateOutputLayerConfig()){ - OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROC.class); - } - return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROC(rocThresholdSteps))[0]; - } - - /** - * @deprecated To be removed - use {@link #evaluateROCMultiClass(DataSetIterator, int)} to enforce selection of appropriate ROC/threshold configuration - */ - @Deprecated - public T evaluateROCMultiClass(DataSetIterator iterator) { - return evaluateROCMultiClass(iterator, 0); - } - - /** - * Evaluate the network on the specified data, using the {@link ROCMultiClass} class - * - * @param iterator Data to evaluate on - * @param rocThresholdSteps Number of threshold steps to use with {@link ROCMultiClass} - * @return Multi-class ROC evaluation on the given dataset - */ - public T evaluateROCMultiClass(DataSetIterator iterator, int rocThresholdSteps) { - Layer outputLayer = getOutputLayer(); - if(getLayerWiseConfigurations().isValidateOutputLayerConfig()){ - OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), ROCMultiClass.class); - } - return (T)doEvaluation(iterator, new org.deeplearning4j.eval.ROCMultiClass(rocThresholdSteps))[0]; - } - - /** - * Perform evaluation using an arbitrary IEvaluation instance. - * - * @param iterator data to evaluate on - */ - public T[] doEvaluation(DataSetIterator iterator, T... evaluations) { - try{ - return doEvaluationHelper(iterator, evaluations); - } catch (OutOfMemoryError e){ - CrashReportingUtil.writeMemoryCrashDump(this, e); - throw e; - } - } - - public T[] doEvaluationHelper(DataSetIterator iterator, T... evaluations) { - if (!iterator.hasNext() && iterator.resetSupported()) { - iterator.reset(); - } - - DataSetIterator iter = iterator.asyncSupported() ? new AsyncDataSetIterator(iterator, 2, true) : iterator; - - WorkspaceMode cMode = layerWiseConfigurations.getTrainingWorkspaceMode(); - layerWiseConfigurations.setTrainingWorkspaceMode(layerWiseConfigurations.getInferenceWorkspaceMode()); - - //First: let's determine if we should do 'split feed forward' for long time series - //The idea: RNN 20k time steps. Train using TBPTT length 100 -> 200 segments of length 100. If we naively - // just use .output(INDArray) here, then our memory requirements are 200x larger than if we did the same - // evaluation in segments... 
- //Only do this if TBPTT is enabled - if not, it means we can train without TBPTT and hence should be able - // to test without splitting also - boolean useRnnSegments = (layerWiseConfigurations.getBackpropType() == BackpropType.TruncatedBPTT); - - MemoryWorkspace outputWs; - if(getLayerWiseConfigurations().getInferenceWorkspaceMode() == WorkspaceMode.ENABLED){ - outputWs = Nd4j.getWorkspaceManager().getWorkspaceForCurrentThread(WS_ALL_LAYERS_ACT_CONFIG, WS_OUTPUT_MEM); - } else { - outputWs = new DummyWorkspace(); - } - - while (iter.hasNext()) { - DataSet next = iter.next(); - - if (next.getFeatures() == null || next.getLabels() == null) - continue; - - - INDArray features = next.getFeatures(); - INDArray labels = next.getLabels(); - INDArray fMask = next.getFeaturesMaskArray(); - INDArray lMask = next.getLabelsMaskArray(); - List meta = next.getExampleMetaData(); - - - if (!useRnnSegments) { - //Standard/non-RNN case: - try (MemoryWorkspace ws = outputWs.notifyScopeEntered()) { - INDArray out = outputOfLayerDetached(false, FwdPassType.STANDARD, layers.length - 1, features, fMask, lMask, ws); - - try (MemoryWorkspace wsO = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { - for (T evaluation : evaluations) - evaluation.eval(labels, out, lMask, meta); - } - } - } else { - rnnClearPreviousState(); - - - //Get subset of features and labels: - val fwdLen = layerWiseConfigurations.getTbpttFwdLength(); - val tsLength = features.size(2); - long nSubsets = tsLength / fwdLen; - if (tsLength % fwdLen != 0) - nSubsets++; //Example: 100 fwdLen with timeSeriesLength=120 -> want 2 subsets (1 of size 100, 1 of size 20) - for (int i = 0; i < nSubsets; i++) { - val startTimeIdx = i * fwdLen; - val endTimeIdx = Math.min(startTimeIdx + fwdLen, tsLength); - - if (endTimeIdx > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - INDArray[] subsets = getSubsetsForTbptt(startTimeIdx, (int) endTimeIdx, features, labels, fMask, lMask); - - setLayerMaskArrays(subsets[2], subsets[3]); - - try (MemoryWorkspace ws = outputWs.notifyScopeEntered()) { - INDArray outSub = rnnTimeStep(subsets[0], ws); - try (MemoryWorkspace wsO = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { - for (T evaluation : evaluations) - evaluation.eval(subsets[1], outSub, subsets[3]); - } - } - } + try (MemoryWorkspace wsO = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + for (T evaluation : evaluations) { + evaluation.eval(labels, out, lMask, meta); } - - //Clear inputs, masks etc. Important to avoid leaking invalidated/out of scope arrays between iterations - clearLayersStates(); + } } + } else { + rnnClearPreviousState(); - if (iterator.asyncSupported()) - ((AsyncDataSetIterator) iter).shutdown(); - - layerWiseConfigurations.setTrainingWorkspaceMode(cMode); - - return evaluations; - } - - /** - * Evaluate the network on the provided data set. Used for evaluating the performance of classifiers - * - * @param iterator Data to undertake evaluation on - * @return Evaluation object, summarizing the results of the evaluation on the provided DataSetIterator - */ - public Evaluation evaluate(DataSetIterator iterator, List labelsList) { - return evaluate(iterator, labelsList, 1); - } - - @Override - public INDArray updaterState() { - return getUpdater() != null ? 
getUpdater().getStateViewArray() : null; - } - - @Override - public void fit(MultiDataSet dataSet) { - if (dataSet.getFeatures().length == 1 && dataSet.getLabels().length == 1) { - INDArray features = dataSet.getFeatures(0); - INDArray labels = dataSet.getLabels(0); - INDArray fMask = null; - INDArray lMask = null; - - if (dataSet.getFeaturesMaskArrays() != null) - fMask = dataSet.getFeaturesMaskArrays()[0]; - - if (dataSet.getFeaturesMaskArrays() != null) - lMask = dataSet.getLabelsMaskArrays()[0]; - - DataSet ds = new DataSet(features, labels, fMask, lMask); - fit(ds); - } else { - throw new DL4JInvalidInputException( - "MultiLayerNetwork can't handle MultiDataSet with more than 1 features or labels array." + - "Please consider use of ComputationGraph"); + //Get subset of features and labels: + val fwdLen = layerWiseConfigurations.getTbpttFwdLength(); + val tsLength = features.size(2); + long nSubsets = tsLength / fwdLen; + if (tsLength % fwdLen != 0) { + nSubsets++; //Example: 100 fwdLen with timeSeriesLength=120 -> want 2 subsets (1 of size 100, 1 of size 20) } - } + for (int i = 0; i < nSubsets; i++) { + val startTimeIdx = i * fwdLen; + val endTimeIdx = Math.min(startTimeIdx + fwdLen, tsLength); - /** - * Perform minibatch training on all minibatches in the MultiDataSetIterator, for the specified number of epochs. - * Equvalent to calling {@link #fit(MultiDataSetIterator)} numEpochs times in a loop - * - * @param iterator Training data (DataSetIterator). Iterator must support resetting - * @param numEpochs Number of training epochs, >= 1 - */ - public void fit(@NonNull MultiDataSetIterator iterator, int numEpochs){ - Preconditions.checkArgument(numEpochs > 0, "Number of epochs much be > 0. Got numEpochs = %s", numEpochs); - Preconditions.checkArgument(numEpochs == 1 || iterator.resetSupported(), "Cannot perform multiple epochs training using" + - "iterator has does not support resetting (iterator.resetSupported() returned false)"); - - for(int i = 0; i < numEpochs; i++) { - fit(iterator); - } - } - - /** - * Perform minibatch training on all minibatches in the MultiDataSetIterator.
- * Note: The MultiDataSets in the MultiDataSetIterator must have exactly 1 input and output array (as - * MultiLayerNetwork only supports 1 input and 1 output) - * - * @param iterator Training data (DataSetIterator). Iterator must support resetting - */ - @Override - public void fit(MultiDataSetIterator iterator) { - fit(new MultiDataSetWrapperIterator(iterator)); - } - - @Override - public T[] doEvaluation(MultiDataSetIterator iterator, T[] evaluations) { - return doEvaluation(new MultiDataSetWrapperIterator(iterator), evaluations); - } - - /** - * Evaluate the network (for classification) on the provided data set, with top N accuracy in addition to standard accuracy. - * For 'standard' accuracy evaluation only, use topN = 1 - * - * @param iterator Iterator (data) to evaluate on - * @param labelsList List of labels. May be null. - * @param topN N value for top N accuracy evaluation - * @return Evaluation object, summarizing the results of the evaluation on the provided DataSetIterator - */ - public Evaluation evaluate(DataSetIterator iterator, List labelsList, int topN) { - if (layers == null || !(getOutputLayer() instanceof IOutputLayer)) { - throw new IllegalStateException("Cannot evaluate network with no output layer"); - } - if (labelsList == null) { - try { - labelsList = iterator.getLabels(); - } catch (Throwable t){ } //Ignore, maybe UnsupportedOperationException etc - } - - Layer outputLayer = getOutputLayer(); - if(getLayerWiseConfigurations().isValidateOutputLayerConfig()){ - OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), Evaluation.class); - } - - Evaluation e = new org.deeplearning4j.eval.Evaluation(labelsList, topN); - doEvaluation(iterator, e); - - return e; - } - - protected void update(Task task) { - if (!initDone) { - initDone = true; - Heartbeat heartbeat = Heartbeat.getInstance(); - task = ModelSerializer.taskByModel(this); - Environment env = EnvironmentUtils.buildEnvironment(); - heartbeat.reportEvent(Event.STANDALONE, env, task); - } - } - - /** - * String detailing the architecture of the multilayernetwork. - * Columns are LayerIndex with layer type, nIn, nOut, Total number of parameters and the Shapes of the parameters - * Will also give information about frozen layers, if any. - * @return Summary as a string - * @see #memoryInfo(int, InputType) - */ - public String summary() { - return summary(null); - } - - /** - * String detailing the architecture of the multilayernetwork. - * Will also display activation size when given an input type. - * Columns are LayerIndex with layer type, nIn, nOut, Total number of parameters, Shapes of the parameters, Input activation shape, Output activation shape - * Will also give information about frozen layers, if any. - * @return Summary as a string - * @see #memoryInfo(int, InputType) - */ - public String summary(InputType inputType) { - StringBuilder ret = new StringBuilder(); - ret.append("\n"); - - List lines = new ArrayList<>(); - if(inputType == null){ - lines.add(new String[]{"LayerName (LayerType)", "nIn,nOut", "TotalParams", "ParamsShape"}); - } else { - lines.add(new String[]{"LayerName (LayerType)", "nIn,nOut", "TotalParams", "ParamsShape", "InputShape", "OutputShape"}); - } - int[] maxLength = new int[inputType == null ? 
4 : 6]; - String[] header = lines.get(0); - for( int i=0; i 0) { - paramShape = ""; - if (currentLayer instanceof BidirectionalLayer) { // Bidirectional layer is not an FFL - BidirectionalLayer bi = (BidirectionalLayer) currentLayer; - in = String.valueOf(((Bidirectional)bi.conf().getLayer()).getNIn()); - out = String.valueOf(((Bidirectional)bi.conf().getLayer()).getNOut()); - } else { - try { - in = String.valueOf(((FeedForwardLayer) currentLayer.conf().getLayer()).getNIn()); - out = String.valueOf(((FeedForwardLayer) currentLayer.conf().getLayer()).getNOut()); - } - catch (Exception e) { // Some layers, like PReLU, are just BaseLayers (but have parameters) - } - } - Set paraNames = currentLayer.paramTable().keySet(); - for (String aP : paraNames) { - String paramS = ArrayUtils.toString(currentLayer.paramTable().get(aP).shape()); - paramShape += aP + ":" + paramS + ", "; - } - paramShape = paramShape.subSequence(0, paramShape.lastIndexOf(",")).toString(); - } - if (currentLayer instanceof FrozenLayer) { - frozenParams += currentLayer.numParams(); - classNameArr = ((FrozenLayer) currentLayer).getInsideLayer().getClass().getName().split("\\."); - className = "Frozen " + classNameArr[classNameArr.length - 1]; - } - - String[] line; - if (inputType == null) { - line = new String[]{name + " (" + className + ")", in + "," + out, paramCount, paramShape}; - } else { - line = new String[]{name + " (" + className + ")", in + "," + out, paramCount,paramShape,inShape,outShape}; - } - for( int i=0; iautomatically when using iterator-based fitting methods, such as - * {@link #fit(DataSetIterator)}. However, when using non-iterator fit methods (DataSet, INDArray/INDArray etc), - * the network has no way to know when one epoch ends and another starts. In such situations, this method - * can be used to increment the epoch counter.
- * Note that the epoch counter is used for situations such as some learning rate schedules, and the like. - * - * The current epoch count can be obtained using {@code MultiLayerConfiguration.getLayerwiseConfiguration().getEpochCount()} - */ - public void incrementEpochCount(){ - layerWiseConfigurations.setEpochCount(layerWiseConfigurations.getEpochCount() + 1); - synchronizeIterEpochCounts(); - } - - - protected void synchronizeIterEpochCounts() { - //TODO: this is necessary for some schedules - but the redundant values are a little ugly... - int currIter = getIterationCount(); - int currEpoch = getEpochCount(); - for(Layer l : layers) { - l.setIterationCount(currIter); - l.setEpochCount(currEpoch); - } - } - - /** - * Save the MultiLayerNetwork to a file. Restore using {@link #load(File, boolean)}. - * Note that this saves the updater (i.e., the state array for momentum/Adam/rmsprop etc), which is desirable - * if further training will be undertaken. - * - * @param f File to save the network to - * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) - * @see #save(File, boolean) - */ - public void save( File f ) throws IOException { - save(f, true); - } - - /** - * Save the MultiLayerNetwork to a file. Restore using {@link #load(File, boolean)}. - * - * @param f File to save the network to - * @param saveUpdater If true: save the updater (i.e., the state array for momentum/Adam/rmsprop etc), which should - * usually be saved if further training is required - * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) - * @see #save(File, boolean) - */ - public void save(File f, boolean saveUpdater) throws IOException{ - ModelSerializer.writeModel(this, f, saveUpdater); - } - - /** - * Restore a MultiLayerNetwork to a file, saved using {@link #save(File)} or {@link ModelSerializer} - * @param f File to load the network from - * @param loadUpdater If true: load the updater if it is available (i.e., the state array for momentum/Adam/rmsprop - * etc) - use false if no further training is required, or true if further training - * will be undertaken - * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) - */ - public static MultiLayerNetwork load(File f, boolean loadUpdater) throws IOException { - return ModelSerializer.restoreMultiLayerNetwork(f, loadUpdater); - } - - /** - * Convert this MultiLayerNetwork to a ComputationGraph - * - * @return ComputationGraph equivalent to this network (including parameters and updater state) - */ - public ComputationGraph toComputationGraph(){ - return NetworkUtils.toComputationGraph(this); - } - - /** - * Return a copy of the network with the parameters and activations set to use the specified (floating point) data type. - * If the existing datatype is the same as the requested dataype, the original network will be returned unchanged. - * Only floating point datatypes (DOUBLE, FLOAT, HALF) may be used. - * - * @param dataType Datatype to convert the network to - * @return The network, set to use the specified datatype for the parameters and activations - */ - public MultiLayerNetwork convertDataType(@NonNull DataType dataType){ - Preconditions.checkState(dataType.isFPType(), "Invalid DataType: %s. 
Can only convert network to a floating point type", dataType); - if(dataType == params().dataType()){ - return this; - } - - try(MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { - INDArray newParams = params().castTo(dataType); - String jsonConfig = getLayerWiseConfigurations().toJson(); - MultiLayerConfiguration newConf = MultiLayerConfiguration.fromJson(jsonConfig); - newConf.setDataType(dataType); - MultiLayerNetwork newNet = new MultiLayerNetwork(newConf); - newNet.init(newParams, false); - - Updater u = getUpdater(false); - if(u != null && u.getStateViewArray() != null){ - INDArray oldUpdaterState = u.getStateViewArray(); - newNet.getUpdater(true).getStateViewArray().assign(oldUpdaterState); - } - return newNet; - } - } - - /** - * Set the learning rate for all layers in the network to the specified value. Note that if any learning rate - * schedules are currently present, these will be removed in favor of the new (fixed) learning rate.
- *
- * Note: This method not free from a performance point of view: a proper learning rate schedule - * should be used in preference to calling this method at every iteration. - * - * @param newLr New learning rate for all layers - * @see #setLearningRate(ISchedule) - * @see #setLearningRate(int, double) - */ - public void setLearningRate(double newLr){ - NetworkUtils.setLearningRate(this, newLr); - } - - /** - * Set the learning rate schedule for all layers in the network to the specified schedule. - * This schedule will replace any/all existing schedules, and also any fixed learning rate values.
- * Note that the iteration/epoch counts will not be reset. Use {@link MultiLayerConfiguration#setIterationCount(int)} - * and {@link MultiLayerConfiguration#setEpochCount(int)} if this is required - * - * @param newLr New learning rate schedule for all layers - * @see #setLearningRate(ISchedule) - * @see #setLearningRate(int, double) - */ - public void setLearningRate(ISchedule newLr){ - NetworkUtils.setLearningRate(this, newLr); - } - - /** - * Set the learning rate for a single layer in the network to the specified value. Note that if any learning rate - * schedules are currently present, these will be removed in favor of the new (fixed) learning rate.
- *
- * Note: This method not free from a performance point of view: a proper learning rate schedule - * should be used in preference to calling this method at every iteration. Note also that - * {@link #setLearningRate(double)} should also be used in preference, when all layers need to be set to a new LR - * - * @param layerNumber Number of the layer to set the LR for - * @param newLr New learning rate for a single layer - * @see #setLearningRate(ISchedule) - * @see #setLearningRate(int, double) - */ - public void setLearningRate(int layerNumber, double newLr){ - NetworkUtils.setLearningRate(this, layerNumber, newLr); - } - - /** - * Set the learning rate schedule for a single layer in the network to the specified value.
- * Note also that {@link #setLearningRate(ISchedule)} should also be used in preference, when all layers need - * to be set to a new LR schedule.
- * This schedule will replace any/all existing schedules, and also any fixed learning rate values.
- * Note also that the iteration/epoch counts will not be reset. Use {@link MultiLayerConfiguration#setIterationCount(int)} - * and {@link MultiLayerConfiguration#setEpochCount(int)} if this is required - * - * @param layerNumber Number of the layer to set the LR schedule for - * @param newLr New learning rate for a single layer - * @see #setLearningRate(ISchedule) - * @see #setLearningRate(int, double) - */ - public void setLearningRate(int layerNumber, ISchedule newLr){ - NetworkUtils.setLearningRate(this, layerNumber, newLr); - } - - /** - * Get the current learning rate, for the specified layer, from the network. - * Note: If the layer has no learning rate (no parameters, or an updater without a learning rate) then null is returned - * @param layerNumber Layer number to get the learning rate for - * @return Learning rate for the specified layer, or null - */ - public Double getLearningRate(int layerNumber){ - return NetworkUtils.getLearningRate(this, layerNumber); - } - - /** - * Return the layer size (number of units) for the specified layer.
- * Note that the meaning of the "layer size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, recurrent layers: number of units (nOut configuration option)
- * - ConvolutionLayer: the channels (number of channels)
- * - Subsampling layers, global pooling layers, etc: size of 0 is always returned
- * - * @param layer Index of the layer to get the size of. Must be in range 0 to nLayers-1 inclusive - * @return Size of the layer - */ - public int layerSize(int layer) { - if (layer < 0 || layer > layers.length) { - throw new IllegalArgumentException("Invalid layer index: " + layer + ". Layer index must be between 0 and " - + (layers.length - 1) + " inclusive"); - } - org.deeplearning4j.nn.conf.layers.Layer conf = layers[layer].conf().getLayer(); - if (conf == null || !(conf instanceof FeedForwardLayer)) { - return 0; - } - FeedForwardLayer ffl = (FeedForwardLayer) conf; - - if (ffl.getNOut() > Integer.MAX_VALUE) + if (endTimeIdx > Integer.MAX_VALUE) { throw new ND4JArraySizeException(); - return (int) ffl.getNOut(); - } + } + INDArray[] subsets = getSubsetsForTbptt(startTimeIdx, (int) endTimeIdx, features, labels, + fMask, lMask); - /** - * Return the input size (number of inputs) for the specified layer.
- * Note that the meaning of the "input size" can depend on the type of layer. For example:
- * - DenseLayer, OutputLayer, etc: the feature vector size (nIn configuration option)
- * - Recurrent layers: the feature vector size per time step (nIn configuration option)
- * - ConvolutionLayer: the channels (number of channels)
- * - Subsampling layers, global pooling layers, etc: size of 0 is always returned
- * - * @param layer Index of the layer to get the size of. Must be in range 0 to nLayers-1 inclusive - * @return Size of the layer - */ - public int layerInputSize(int layer) { - if (layer < 0 || layer > layers.length) { - throw new IllegalArgumentException("Invalid layer index: " + layer + ". Layer index must be between 0 and " - + (layers.length - 1) + " inclusive"); + setLayerMaskArrays(subsets[2], subsets[3]); + + try (MemoryWorkspace ws = outputWs.notifyScopeEntered()) { + INDArray outSub = rnnTimeStep(subsets[0], ws); + try (MemoryWorkspace wsO = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + for (T evaluation : evaluations) { + evaluation.eval(subsets[1], outSub, subsets[3]); + } + } + } } - org.deeplearning4j.nn.conf.layers.Layer conf = layers[layer].conf().getLayer(); - if (conf == null || !(conf instanceof FeedForwardLayer)) { - return 0; + } + + //Clear inputs, masks etc. Important to avoid leaking invalidated/out of scope arrays between iterations + clearLayersStates(); + } + + if (iterator.asyncSupported()) { + ((AsyncDataSetIterator) iter).shutdown(); + } + + layerWiseConfigurations.setTrainingWorkspaceMode(cMode); + + return evaluations; + } + + /** + * Evaluate the network on the provided data set. Used for evaluating the performance of + * classifiers + * + * @param iterator Data to undertake evaluation on + * @return Evaluation object, summarizing the results of the evaluation on the provided + * DataSetIterator + */ + public Evaluation evaluate(DataSetIterator iterator, List labelsList) { + return evaluate(iterator, labelsList, 1); + } + + @Override + public INDArray updaterState() { + return getUpdater() != null ? getUpdater().getStateViewArray() : null; + } + + @Override + public void fit(MultiDataSet dataSet) { + if (dataSet.getFeatures().length == 1 && dataSet.getLabels().length == 1) { + INDArray features = dataSet.getFeatures(0); + INDArray labels = dataSet.getLabels(0); + INDArray fMask = null; + INDArray lMask = null; + + if (dataSet.getFeaturesMaskArrays() != null) { + fMask = dataSet.getFeaturesMaskArrays()[0]; + } + + if (dataSet.getFeaturesMaskArrays() != null) { + lMask = dataSet.getLabelsMaskArrays()[0]; + } + + DataSet ds = new DataSet(features, labels, fMask, lMask); + fit(ds); + } else { + throw new DL4JInvalidInputException( + "MultiLayerNetwork can't handle MultiDataSet with more than 1 features or labels array." + + "Please consider use of ComputationGraph"); + } + } + + /** + * Perform minibatch training on all minibatches in the MultiDataSetIterator, for the specified + * number of epochs. Equvalent to calling {@link #fit(MultiDataSetIterator)} numEpochs times in a + * loop + * + * @param iterator Training data (DataSetIterator). Iterator must support resetting + * @param numEpochs Number of training epochs, >= 1 + */ + public void fit(@NonNull MultiDataSetIterator iterator, int numEpochs) { + Preconditions.checkArgument(numEpochs > 0, "Number of epochs much be > 0. Got numEpochs = %s", + numEpochs); + Preconditions.checkArgument(numEpochs == 1 || iterator.resetSupported(), + "Cannot perform multiple epochs training using" + + "iterator has does not support resetting (iterator.resetSupported() returned false)"); + + for (int i = 0; i < numEpochs; i++) { + fit(iterator); + } + } + + /** + * Perform minibatch training on all minibatches in the MultiDataSetIterator.
Note: The + * MultiDataSets in the MultiDataSetIterator must have exactly 1 input and output array (as + * MultiLayerNetwork only supports 1 input and 1 output) + * + * @param iterator Training data (DataSetIterator). Iterator must support resetting + */ + @Override + public void fit(MultiDataSetIterator iterator) { + fit(new MultiDataSetWrapperIterator(iterator)); + } + + @Override + public T[] doEvaluation(MultiDataSetIterator iterator, T[] evaluations) { + return doEvaluation(new MultiDataSetWrapperIterator(iterator), evaluations); + } + + /** + * Evaluate the network (for classification) on the provided data set, with top N accuracy in + * addition to standard accuracy. For 'standard' accuracy evaluation only, use topN = 1 + * + * @param iterator Iterator (data) to evaluate on + * @param labelsList List of labels. May be null. + * @param topN N value for top N accuracy evaluation + * @return Evaluation object, summarizing the results of the evaluation on the provided + * DataSetIterator + */ + public Evaluation evaluate(DataSetIterator iterator, List labelsList, int topN) { + if (layers == null || !(getOutputLayer() instanceof IOutputLayer)) { + throw new IllegalStateException("Cannot evaluate network with no output layer"); + } + if (labelsList == null) { + try { + labelsList = iterator.getLabels(); + } catch (Throwable t) { + } //Ignore, maybe UnsupportedOperationException etc + } + + Layer outputLayer = getOutputLayer(); + if (getLayerWiseConfigurations().isValidateOutputLayerConfig()) { + OutputLayerUtil.validateOutputLayerForClassifierEvaluation(outputLayer.conf().getLayer(), + Evaluation.class); + } + + Evaluation e = new org.deeplearning4j.eval.Evaluation(labelsList, topN); + doEvaluation(iterator, e); + + return e; + } + + protected void update(Task task) { + if (!initDone) { + initDone = true; + Heartbeat heartbeat = Heartbeat.getInstance(); + task = ModelSerializer.taskByModel(this); + Environment env = EnvironmentUtils.buildEnvironment(); + heartbeat.reportEvent(Event.STANDALONE, env, task); + } + } + + /** + * String detailing the architecture of the multilayernetwork. Columns are LayerIndex with layer + * type, nIn, nOut, Total number of parameters and the Shapes of the parameters Will also give + * information about frozen layers, if any. + * + * @return Summary as a string + * @see #memoryInfo(int, InputType) + */ + public String summary() { + return summary(null); + } + + /** + * String detailing the architecture of the multilayernetwork. Will also display activation size + * when given an input type. Columns are LayerIndex with layer type, nIn, nOut, Total number of + * parameters, Shapes of the parameters, Input activation shape, Output activation shape Will also + * give information about frozen layers, if any. + * + * @return Summary as a string + * @see #memoryInfo(int, InputType) + */ + public String summary(InputType inputType) { + StringBuilder ret = new StringBuilder(); + ret.append("\n"); + + List lines = new ArrayList<>(); + if (inputType == null) { + lines.add(new String[]{"LayerName (LayerType)", "nIn,nOut", "TotalParams", "ParamsShape"}); + } else { + lines.add(new String[]{"LayerName (LayerType)", "nIn,nOut", "TotalParams", "ParamsShape", + "InputShape", "OutputShape"}); + } + int[] maxLength = new int[inputType == null ? 
4 : 6]; + String[] header = lines.get(0); + for (int i = 0; i < header.length; i++) { + maxLength[i] = header[i].length(); + } + + int frozenParams = 0; + for (org.deeplearning4j.nn.api.Layer currentLayer : getLayers()) { + String name = currentLayer.conf().getLayer().getLayerName(); + if (name == null) { + name = String.valueOf(currentLayer.getIndex()); + } + String paramShape = "-"; + String in = "-"; + String out = "-"; + String[] classNameArr = currentLayer.getClass().getName().split("\\."); + String className = classNameArr[classNameArr.length - 1]; + String paramCount = String.format("%,d", currentLayer.numParams()); + String inShape = ""; + String outShape = ""; + InputPreProcessor preProcessor; + InputType outType; + if (inputType != null) { + preProcessor = getLayerWiseConfigurations().getInputPreProcess(currentLayer.getIndex()); + inShape = inputType.toString(); + if (preProcessor != null) { + inputType = preProcessor.getOutputType(inputType); + inShape += "--> " + inputType.toString(); } - FeedForwardLayer ffl = (FeedForwardLayer) conf; - - if (ffl.getNIn() > Integer.MAX_VALUE) - throw new ND4JArraySizeException(); - return (int) ffl.getNIn(); - } - - /** - * Indicates whether some other object is "equal to" this one. - *

- * The {@code equals} method implements an equivalence relation - * on non-null object references: - *

    - *
  • It is reflexive: for any non-null reference value - * {@code x}, {@code x.equals(x)} should return - * {@code true}. - *
  • It is symmetric: for any non-null reference values - * {@code x} and {@code y}, {@code x.equals(y)} - * should return {@code true} if and only if - * {@code y.equals(x)} returns {@code true}. - *
  • It is transitive: for any non-null reference values - * {@code x}, {@code y}, and {@code z}, if - * {@code x.equals(y)} returns {@code true} and - * {@code y.equals(z)} returns {@code true}, then - * {@code x.equals(z)} should return {@code true}. - *
  • It is consistent: for any non-null reference values - * {@code x} and {@code y}, multiple invocations of - * {@code x.equals(y)} consistently return {@code true} - * or consistently return {@code false}, provided no - * information used in {@code equals} comparisons on the - * objects is modified. - *
  • For any non-null reference value {@code x}, - * {@code x.equals(null)} should return {@code false}. - *
- *

- * The {@code equals} method for class {@code Object} implements - * the most discriminating possible equivalence relation on objects; - * that is, for any non-null reference values {@code x} and - * {@code y}, this method returns {@code true} if and only - * if {@code x} and {@code y} refer to the same object - * ({@code x == y} has the value {@code true}). - *

- * Note that it is generally necessary to override the {@code hashCode} - * method whenever this method is overridden, so as to maintain the - * general contract for the {@code hashCode} method, which states - * that equal objects must have equal hash codes. - * - * @param obj the reference object with which to compare. - * @return {@code true} if this object is the same as the obj - * argument; {@code false} otherwise. - * @see #hashCode() - * @see HashMap - */ - @Override - public boolean equals(Object obj) { - if (obj == null) - return false; - if (obj instanceof MultiLayerNetwork) { - MultiLayerNetwork network = (MultiLayerNetwork) obj; - boolean paramsEquals = network.params().equals(params()); - boolean confEquals = getLayerWiseConfigurations().equals(network.getLayerWiseConfigurations()); - boolean updaterEquals = getUpdater().equals(network.getUpdater()); - return paramsEquals && confEquals && updaterEquals; + outType = currentLayer.conf().getLayer().getOutputType(currentLayer.getIndex(), inputType); + outShape = outType.toString(); + inputType = outType; + } + if (currentLayer.numParams() > 0) { + paramShape = ""; + if (currentLayer instanceof BidirectionalLayer) { // Bidirectional layer is not an FFL + BidirectionalLayer bi = (BidirectionalLayer) currentLayer; + in = String.valueOf(((Bidirectional) bi.conf().getLayer()).getNIn()); + out = String.valueOf(((Bidirectional) bi.conf().getLayer()).getNOut()); + } else { + try { + in = String.valueOf(((FeedForwardLayer) currentLayer.conf().getLayer()).getNIn()); + out = String.valueOf(((FeedForwardLayer) currentLayer.conf().getLayer()).getNOut()); + } catch ( + Exception e) { // Some layers, like PReLU, are just BaseLayers (but have parameters) + } } - return false; - } - - private void writeObject(ObjectOutputStream oos) throws IOException { - ModelSerializer.writeModel(this, oos, true); - } - - private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException { - val mln = ModelSerializer.restoreMultiLayerNetwork(ois, true); - - this.defaultConfiguration = mln.defaultConfiguration.clone(); - this.layerWiseConfigurations = mln.layerWiseConfigurations.clone(); - this.init(); - this.flattenedParams.assign(mln.flattenedParams); - - int numWorkingMem = 2 * (layerWiseConfigurations.getConfs().size() + layerWiseConfigurations.getInputPreProcessors().size()); - WS_LAYER_WORKING_MEM_CONFIG = getLayerWorkingMemWSConfig(numWorkingMem); - WS_LAYER_ACT_X_CONFIG = getLayerActivationWSConfig(layerWiseConfigurations.getConfs().size()); - - if (mln.getUpdater() != null && mln.getUpdater(false).getStateViewArray() != null) - this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray()); - } - - /** - * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces - * Note that the network should not be used again for any purpose after it has been closed - */ - @Override - public void close(){ - //Close the INDArray and dealloc - if(flattenedParams.closeable()) - flattenedParams.close(); - - if(flattenedGradients != null && flattenedGradients.closeable()) - flattenedGradients.close(); - - Updater u = getUpdater(false); - if(u != null && u.getStateViewArray() != null) { - INDArray state = u.getStateViewArray(); - if(state.closeable()) - state.close(); + Set paraNames = currentLayer.paramTable().keySet(); + for (String aP : paraNames) { + String paramS = ArrayUtils.toString(currentLayer.paramTable().get(aP).shape()); + paramShape += aP + ":" + 
paramS + ", "; } + paramShape = paramShape.subSequence(0, paramShape.lastIndexOf(",")).toString(); + } + if (currentLayer instanceof FrozenLayer) { + frozenParams += currentLayer.numParams(); + classNameArr = ((FrozenLayer) currentLayer).getInsideLayer().getClass().getName() + .split("\\."); + className = "Frozen " + classNameArr[classNameArr.length - 1]; + } - Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); - System.gc(); + String[] line; + if (inputType == null) { + line = new String[]{name + " (" + className + ")", in + "," + out, paramCount, paramShape}; + } else { + line = new String[]{name + " (" + className + ")", in + "," + out, paramCount, paramShape, + inShape, outShape}; + } + for (int i = 0; i < line.length; i++) { + maxLength[i] = Math.max(maxLength[i], line[i] == null ? 0 : line[i].length()); + } + lines.add(line); } + + StringBuilder sbFormat = new StringBuilder(); + int totalLength = 0; + int pos = 0; + for (int length : maxLength) { + int currLength; + if (pos++ == maxLength.length - 1) { + currLength = length; + } else { + currLength = length + 3; + } + sbFormat.append("%-").append(currLength).append("s"); + totalLength += currLength; + } + sbFormat.append("\n"); + String format = sbFormat.toString(); + + ret.append(StringUtils.repeat("=", totalLength)) + .append("\n"); + + boolean first = true; + for (String[] line : lines) { + String formatted = String.format(format, (Object[]) line); + ret.append(formatted); + if (first) { + ret.append(StringUtils.repeat("=", totalLength)).append("\n"); + first = false; + } + } + + ret.append(StringUtils.repeat("-", totalLength)); + ret.append(String.format("\n%30s %,d", "Total Parameters: ", params().length())); + ret.append( + String.format("\n%30s %,d", "Trainable Parameters: ", params().length() - frozenParams)); + ret.append(String.format("\n%30s %,d", "Frozen Parameters: ", frozenParams)); + ret.append("\n"); + ret.append(StringUtils.repeat("=", totalLength)); + ret.append("\n"); + return ret.toString(); + } + + /** + * Generate information regarding memory use for the network, for the given input type and + * minibatch size. Note that when using workspaces or CuDNN, the network should be trained for + * some iterations so that the memory workspaces have time to initialize. Without this, the memory + * requirements during training may be underestimated. + *

+ * Note also that this is the same information that is generated during an OOM crash when
+ * training or performing inference.
+ *
+ * @param minibatch Minibatch size to estimate memory for
+ * @param inputType Input type to the network
+ * @return A String with information about network memory use
+ */
+ public String memoryInfo(int minibatch, InputType inputType) {
+ return CrashReportingUtil.generateMemoryStatus(this, minibatch, inputType);
+ }
+
+ /**
+ * Clear any state preserved within the layers (for example, RNN state and noise weight
+ * parameters).
+ */
+ public void clearLayersStates() {
+ for (Layer layer : layers) {
+ layer.clear();
+ layer.clearNoiseWeightParams();
+ }
+ }
+
+ /**
+ * Increment the epoch count (in the underlying {@link MultiLayerConfiguration}) by 1. Note that
+ * this is done automatically when using iterator-based fitting methods, such as
+ * {@link #fit(DataSetIterator)}. However, when using non-iterator fit methods (DataSet,
+ * INDArray/INDArray etc), the network has no way to know when one epoch ends and another starts.
+ * In such situations, this method can be used to increment the epoch counter. Note that the
+ * epoch counter is used for situations such as some learning rate schedules, and the like.
+ *

+ * The current epoch count can be obtained using + * {@code MultiLayerConfiguration.getLayerwiseConfiguration().getEpochCount()} + */ + public void incrementEpochCount() { + layerWiseConfigurations.setEpochCount(layerWiseConfigurations.getEpochCount() + 1); + synchronizeIterEpochCounts(); + } + + protected void synchronizeIterEpochCounts() { + //TODO: this is necessary for some schedules - but the redundant values are a little ugly... + int currIter = getIterationCount(); + int currEpoch = getEpochCount(); + for (Layer l : layers) { + l.setIterationCount(currIter); + l.setEpochCount(currEpoch); + } + } + + /** + * Save the MultiLayerNetwork to a file. Restore using {@link #load(File, boolean)}. Note that + * this saves the updater (i.e., the state array for momentum/Adam/rmsprop etc), which is + * desirable if further training will be undertaken. + * + * @param f File to save the network to + * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) + * @see #save(File, boolean) + */ + public void save(File f) throws IOException { + save(f, true); + } + + /** + * Save the MultiLayerNetwork to a file. Restore using {@link #load(File, boolean)}. + * + * @param f File to save the network to + * @param saveUpdater If true: save the updater (i.e., the state array for momentum/Adam/rmsprop + * etc), which should usually be saved if further training is required + * @see ModelSerializer ModelSerializer for more details (and saving/loading via streams) + * @see #save(File, boolean) + */ + public void save(File f, boolean saveUpdater) throws IOException { + ModelSerializer.writeModel(this, f, saveUpdater); + } + + /** + * Convert this MultiLayerNetwork to a ComputationGraph + * + * @return ComputationGraph equivalent to this network (including parameters and updater state) + */ + public ComputationGraph toComputationGraph() { + return NetworkUtils.toComputationGraph(this); + } + + /** + * Return a copy of the network with the parameters and activations set to use the specified + * (floating point) data type. If the existing datatype is the same as the requested dataype, the + * original network will be returned unchanged. Only floating point datatypes (DOUBLE, FLOAT, + * HALF) may be used. + * + * @param dataType Datatype to convert the network to + * @return The network, set to use the specified datatype for the parameters and activations + */ + public MultiLayerNetwork convertDataType(@NonNull DataType dataType) { + Preconditions.checkState(dataType.isFPType(), + "Invalid DataType: %s. Can only convert network to a floating point type", dataType); + if (dataType == params().dataType()) { + return this; + } + + try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) { + INDArray newParams = params().castTo(dataType); + String jsonConfig = getLayerWiseConfigurations().toJson(); + MultiLayerConfiguration newConf = MultiLayerConfiguration.fromJson(jsonConfig); + newConf.setDataType(dataType); + MultiLayerNetwork newNet = new MultiLayerNetwork(newConf); + newNet.init(newParams, false); + + Updater u = getUpdater(false); + if (u != null && u.getStateViewArray() != null) { + INDArray oldUpdaterState = u.getStateViewArray(); + newNet.getUpdater(true).getStateViewArray().assign(oldUpdaterState); + } + return newNet; + } + } + + /** + * Set the learning rate for all layers in the network to the specified value. Note that if any + * learning rate schedules are currently present, these will be removed in favor of the new + * (fixed) learning rate.
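For orientation, a minimal usage sketch of the reporting, persistence and conversion helpers above; the file name, input shape and fitting loop are illustrative assumptions, not part of this patch:

    import java.io.File;
    import java.io.IOException;
    import org.deeplearning4j.nn.conf.inputs.InputType;
    import org.deeplearning4j.nn.graph.ComputationGraph;
    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
    import org.nd4j.linalg.api.buffer.DataType;
    import org.nd4j.linalg.dataset.DataSet;

    public class MultiLayerNetworkUsageSketch {

        public static void run(MultiLayerNetwork net, DataSet data) throws IOException {
            // Memory report for a minibatch of 32 examples with 784 input features; train a few
            // iterations first so workspaces are initialized and the estimate is not too low.
            System.out.println(net.memoryInfo(32, InputType.feedForward(784)));

            // Fitting from a DataSet directly gives the network no way to detect epoch
            // boundaries, so the epoch counter is incremented manually.
            for (int epoch = 0; epoch < 3; epoch++) {
                net.fit(data);
                net.incrementEpochCount();
            }

            // Drop any state preserved inside the layers (e.g. RNN state, noise weight params).
            net.clearLayersStates();

            // Persist with the updater state so training can be resumed later.
            File f = new File("net.zip"); // illustrative path
            net.save(f, true);
            MultiLayerNetwork restored = MultiLayerNetwork.load(f, true);

            // A half-precision copy for inference; the original network is left unchanged.
            MultiLayerNetwork fp16 = restored.convertDataType(DataType.HALF);

            // The same network expressed as a ComputationGraph.
            ComputationGraph cg = restored.toComputationGraph();
        }
    }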
+ *
+ * Note: This method is not free from a performance point of view: a proper learning rate
+ * schedule should be used in preference to calling this method at every iteration.
+ *
+ * @param newLr New learning rate for all layers
+ * @see #setLearningRate(ISchedule)
+ * @see #setLearningRate(int, double)
+ */
+ public void setLearningRate(double newLr) {
+ NetworkUtils.setLearningRate(this, newLr);
+ }
+
+ /**
+ * Set the learning rate schedule for all layers in the network to the specified schedule. This
+ * schedule will replace any/all existing schedules, and also any fixed learning rate values.
+ * Note that the iteration/epoch counts will not be reset. Use + * {@link MultiLayerConfiguration#setIterationCount(int)} and + * {@link MultiLayerConfiguration#setEpochCount(int)} if this is required + * + * @param newLr New learning rate schedule for all layers + * @see #setLearningRate(ISchedule) + * @see #setLearningRate(int, double) + */ + public void setLearningRate(ISchedule newLr) { + NetworkUtils.setLearningRate(this, newLr); + } + + /** + * Set the learning rate for a single layer in the network to the specified value. Note that if + * any learning rate schedules are currently present, these will be removed in favor of the new + * (fixed) learning rate.
+ *
+ * Note: This method is not free from a performance point of view: a proper learning rate
+ * schedule should be used in preference to calling this method at every iteration. Note also
+ * that {@link #setLearningRate(double)} should be used in preference when all layers need to
+ * be set to a new LR.
+ *
+ * @param layerNumber Number of the layer to set the LR for
+ * @param newLr New learning rate for a single layer
+ * @see #setLearningRate(ISchedule)
+ * @see #setLearningRate(int, double)
+ */
+ public void setLearningRate(int layerNumber, double newLr) {
+ NetworkUtils.setLearningRate(this, layerNumber, newLr);
+ }
+
+ /**
+ * Set the learning rate schedule for a single layer in the network to the specified value.
+ * Note also that {@link #setLearningRate(ISchedule)} should be used in preference when all
+ * layers need to be set to a new LR schedule. This schedule will replace any/all existing
+ * schedules, and also any fixed learning rate values.
+ * Note that the iteration/epoch counts will not be reset. Use
+ * {@link MultiLayerConfiguration#setIterationCount(int)} and
+ * {@link MultiLayerConfiguration#setEpochCount(int)} if this is required.
+ *
+ * @param layerNumber Number of the layer to set the LR schedule for
+ * @param newLr New learning rate schedule for a single layer
+ * @see #setLearningRate(ISchedule)
+ * @see #setLearningRate(int, double)
+ */
+ public void setLearningRate(int layerNumber, ISchedule newLr) {
+ NetworkUtils.setLearningRate(this, layerNumber, newLr);
+ }
+
+ /**
+ * Get the current learning rate, for the specified layer, from the network. Note: If the layer
+ * has no learning rate (no parameters, or an updater without a learning rate) then null is
+ * returned.
+ *
+ * @param layerNumber Layer number to get the learning rate for
+ * @return Learning rate for the specified layer, or null
+ */
+ public Double getLearningRate(int layerNumber) {
+ return NetworkUtils.getLearningRate(this, layerNumber);
+ }
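A short sketch of how the learning-rate setters and getter above might be called; the schedule type and values are arbitrary examples:

    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
    import org.nd4j.linalg.schedule.ExponentialSchedule;
    import org.nd4j.linalg.schedule.ISchedule;
    import org.nd4j.linalg.schedule.ScheduleType;

    public class LearningRateSketch {

        public static void adjustLearningRates(MultiLayerNetwork net) {
            // Fixed learning rate for every layer; replaces any existing schedules.
            net.setLearningRate(1e-3);

            // Decaying schedule for all layers; iteration/epoch counts are not reset.
            ISchedule schedule = new ExponentialSchedule(ScheduleType.EPOCH, 1e-3, 0.95);
            net.setLearningRate(schedule);

            // Override a single layer, e.g. to fine-tune early layers more slowly.
            net.setLearningRate(0, 1e-4);

            // Null if the layer has no parameters or no learning rate.
            Double lrLayer1 = net.getLearningRate(1);
            System.out.println("LR of layer 1: " + lrLayer1);
        }
    }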
+
+ /**
+ * Return the layer size (number of units) for the specified layer. Note that the meaning of
+ * the "layer size" can depend on the type of layer. For example:
+ * - DenseLayer, OutputLayer, recurrent layers: number of units (nOut configuration option)
+ * - ConvolutionLayer: the number of output channels
+ * - Subsampling layers, global pooling layers, etc: size of 0 is always returned
+ *
+ * @param layer Index of the layer to get the size of. Must be in range 0 to nLayers-1 inclusive
+ * @return Size of the layer
+ */
+ public int layerSize(int layer) {
+ if (layer < 0 || layer >= layers.length) {
+ throw new IllegalArgumentException(
+ "Invalid layer index: " + layer + ". Layer index must be between 0 and "
+ + (layers.length - 1) + " inclusive");
+ }
+ org.deeplearning4j.nn.conf.layers.Layer conf = layers[layer].conf().getLayer();
+ if (conf == null || !(conf instanceof FeedForwardLayer)) {
+ return 0;
+ }
+ FeedForwardLayer ffl = (FeedForwardLayer) conf;
+
+ if (ffl.getNOut() > Integer.MAX_VALUE) {
+ throw new ND4JArraySizeException();
+ }
+ return (int) ffl.getNOut();
+ }
+
+ /**
+ * Return the input size (number of inputs) for the specified layer.
+ * Note that the meaning of the "input size" can depend on the type of layer. For example:
+ * - DenseLayer, OutputLayer, etc: the feature vector size (nIn configuration option)
+ * - Recurrent layers: the feature vector size per time step (nIn configuration option)
+ * - ConvolutionLayer: the number of input channels
+ * - Subsampling layers, global pooling layers, etc: size of 0 is always returned
+ *
+ * @param layer Index of the layer to get the input size of. Must be in range 0 to nLayers-1 inclusive
+ * @return Input size of the layer
+ */
+ public int layerInputSize(int layer) {
+ if (layer < 0 || layer >= layers.length) {
+ throw new IllegalArgumentException(
+ "Invalid layer index: " + layer + ". Layer index must be between 0 and "
+ + (layers.length - 1) + " inclusive");
+ }
+ org.deeplearning4j.nn.conf.layers.Layer conf = layers[layer].conf().getLayer();
+ if (conf == null || !(conf instanceof FeedForwardLayer)) {
+ return 0;
+ }
+ FeedForwardLayer ffl = (FeedForwardLayer) conf;
+
+ if (ffl.getNIn() > Integer.MAX_VALUE) {
+ throw new ND4JArraySizeException();
+ }
+ return (int) ffl.getNIn();
+ }
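As an illustration, the two accessors above could drive a compact per-layer shape overview (a sketch, assuming an initialized network):

    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;

    public class LayerSizeSketch {

        public static void printLayerSizes(MultiLayerNetwork net) {
            // layerInputSize/layerSize return 0 for layers without an nIn/nOut notion
            // (subsampling, global pooling, ...), so no special-casing is needed here.
            for (int i = 0; i < net.getnLayers(); i++) {
                System.out.printf("layer %d: in=%d, out=%d%n",
                        i, net.layerInputSize(i), net.layerSize(i));
            }
        }
    }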

+
+ /**
+ * Indicates whether some other object is "equal to" this one.
+ *
+ * The {@code equals} method implements an equivalence relation on non-null object references:
+ *
+ * • It is reflexive: for any non-null reference value {@code x}, {@code x.equals(x)} should
+ *   return {@code true}.
+ * • It is symmetric: for any non-null reference values {@code x} and {@code y},
+ *   {@code x.equals(y)} should return {@code true} if and only if {@code y.equals(x)} returns
+ *   {@code true}.
+ * • It is transitive: for any non-null reference values {@code x}, {@code y}, and {@code z}, if
+ *   {@code x.equals(y)} returns {@code true} and {@code y.equals(z)} returns {@code true}, then
+ *   {@code x.equals(z)} should return {@code true}.
+ * • It is consistent: for any non-null reference values {@code x} and {@code y}, multiple
+ *   invocations of {@code x.equals(y)} consistently return {@code true} or consistently return
+ *   {@code false}, provided no information used in {@code equals} comparisons on the objects is
+ *   modified.
+ * • For any non-null reference value {@code x}, {@code x.equals(null)} should return
+ *   {@code false}.
+ *
+ * The {@code equals} method for class {@code Object} implements the most discriminating
+ * possible equivalence relation on objects; that is, for any non-null reference values
+ * {@code x} and {@code y}, this method returns {@code true} if and only if {@code x} and
+ * {@code y} refer to the same object ({@code x == y} has the value {@code true}).
+ *
+ * Note that it is generally necessary to override the {@code hashCode} + * method whenever this method is overridden, so as to maintain the + * general contract for the {@code hashCode} method, which states + * that equal objects must have equal hash codes. + * + * @param obj the reference object with which to compare. + * @return {@code true} if this object is the same as the obj argument; {@code false} otherwise. + * @see #hashCode() + * @see HashMap + */ + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (obj instanceof MultiLayerNetwork) { + MultiLayerNetwork network = (MultiLayerNetwork) obj; + boolean paramsEquals = network.params().equals(params()); + boolean confEquals = getLayerWiseConfigurations().equals( + network.getLayerWiseConfigurations()); + boolean updaterEquals = getUpdater().equals(network.getUpdater()); + return paramsEquals && confEquals && updaterEquals; + } + return false; + } + + private void writeObject(ObjectOutputStream oos) throws IOException { + ModelSerializer.writeModel(this, oos, true); + } + + private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException { + val mln = ModelSerializer.restoreMultiLayerNetwork(ois, true); + + this.defaultConfiguration = mln.defaultConfiguration.clone(); + this.layerWiseConfigurations = mln.layerWiseConfigurations.clone(); + this.init(); + this.flattenedParams.assign(mln.flattenedParams); + + int numWorkingMem = 2 * (layerWiseConfigurations.getConfs().size() + + layerWiseConfigurations.getInputPreProcessors().size()); + WS_LAYER_WORKING_MEM_CONFIG = getLayerWorkingMemWSConfig(numWorkingMem); + WS_LAYER_ACT_X_CONFIG = getLayerActivationWSConfig(layerWiseConfigurations.getConfs().size()); + + if (mln.getUpdater() != null && mln.getUpdater(false).getStateViewArray() != null) { + this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray()); + } + } + + /** + * Close the network and deallocate all native memory, including: parameters, gradients, updater + * memory and workspaces Note that the network should not be used again for any purpose after it + * has been closed + */ + @Override + public void close() { + //Close the INDArray and dealloc + if (flattenedParams.closeable()) { + flattenedParams.close(); + } + + if (flattenedGradients != null && flattenedGradients.closeable()) { + flattenedGradients.close(); + } + + Updater u = getUpdater(false); + if (u != null && u.getStateViewArray() != null) { + INDArray state = u.getStateViewArray(); + if (state.closeable()) { + state.close(); + } + } + + Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); + System.gc(); + } } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearning.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearning.java index 52ae7c891..b941cf636 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearning.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearning.java @@ -572,7 +572,7 @@ public class TransferLearning { */ public GraphBuilder(ComputationGraph origGraph) { this.origGraph = origGraph; - this.origConfig = origGraph.getConfiguration().clone(); + this.origConfig = origGraph.getComputationGraphConfiguration().clone(); } /** diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelper.java 
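A brief sketch combining the equality check and explicit close() above with the getComputationGraphConfiguration() accessor that the remaining hunks rename throughout; the class and usage here are illustrative assumptions, not part of the patch:

    import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
    import org.deeplearning4j.nn.graph.ComputationGraph;
    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;

    public class EqualityAndCloseSketch {

        public static void compareAndRelease(MultiLayerNetwork a, MultiLayerNetwork b, ComputationGraph cg) {
            // equals() compares parameters, layer-wise configuration and updater state.
            System.out.println("Networks equal: " + a.equals(b));

            // Accessor as renamed by this patch (previously getConfiguration()).
            ComputationGraphConfiguration conf = cg.getComputationGraphConfiguration();
            System.out.println("Graph epoch count: " + conf.getEpochCount());

            // Deallocate native memory once the network is no longer needed; the
            // instance must not be used again afterwards.
            b.close();
        }
    }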
b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelper.java index f6f3a35c1..a6f7d6c4f 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelper.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/transferlearning/TransferLearningHelper.java @@ -242,7 +242,7 @@ public class TransferLearningHelper { } Set frozenInputVerticesSorted = new HashSet<>(); - frozenInputVerticesSorted.addAll(origGraph.getConfiguration().getNetworkInputs()); + frozenInputVerticesSorted.addAll(origGraph.getComputationGraphConfiguration().getNetworkInputs()); frozenInputVerticesSorted.removeAll(allFrozen); //remove input vertices - just to add back in a predictable order for (String existingInput : frozenInputVerticesSorted) { @@ -328,7 +328,7 @@ public class TransferLearningHelper { String anInput = graphInputs.get(i); if (origGraph.getVertex(anInput).isInputVertex()) { //was an original input to the graph - int inputIndex = origGraph.getConfiguration().getNetworkInputs().indexOf(anInput); + int inputIndex = origGraph.getComputationGraphConfiguration().getNetworkInputs().indexOf(anInput); featuresNow[i] = origGraph.getInput(inputIndex); } else { //needs to be grabbed from the internal activations diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/updater/BaseMultiLayerUpdater.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/updater/BaseMultiLayerUpdater.java index 4f4d1690f..91d24de46 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/updater/BaseMultiLayerUpdater.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/nn/updater/BaseMultiLayerUpdater.java @@ -385,14 +385,14 @@ public abstract class BaseMultiLayerUpdater implements Updater /** * Pre-apply: Apply gradient normalization/clipping * - * @param layer Layer to apply gradient normalization/clipping for + * @param layer ILayer to apply gradient normalization/clipping for * @param gradient Gradient to update * @param iteration The current iteration (i.e., number of parameter updates so far) */ public void preApply(Trainable layer, Gradient gradient, int iteration) { if (layer.getConfig() == null || layer.numParams() == 0) { - //Layer does not have parameters -> no gradient + //ILayer does not have parameters -> no gradient return; } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/api/TrainingListener.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/api/TrainingListener.java index 7c96fd750..81a2d8465 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/api/TrainingListener.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/api/TrainingListener.java @@ -54,7 +54,7 @@ public interface TrainingListener { * only at training time * * @param model Model - * @param activations Layer activations (including input) + * @param activations ILayer activations (including input) */ void onForwardPass(Model model, List activations); @@ -63,7 +63,7 @@ public interface TrainingListener { * only at training time * * @param model Model - * @param activations Layer activations (including input) + * @param activations ILayer activations (including input) */ void onForwardPass(Model model, Map activations); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/listeners/CheckpointListener.java 
b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/listeners/CheckpointListener.java index 4ebf2e050..550e4425b 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/listeners/CheckpointListener.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/listeners/CheckpointListener.java @@ -247,7 +247,7 @@ public class CheckpointListener extends BaseTrainingListener implements Serializ if (model instanceof MultiLayerNetwork) { return ((MultiLayerNetwork) model).getLayerWiseConfigurations().getIterationCount(); } else if (model instanceof ComputationGraph) { - return ((ComputationGraph) model).getConfiguration().getIterationCount(); + return ((ComputationGraph) model).getComputationGraphConfiguration().getIterationCount(); } else { return model.conf().getIterationCount(); } @@ -257,7 +257,7 @@ public class CheckpointListener extends BaseTrainingListener implements Serializ if (model instanceof MultiLayerNetwork) { return ((MultiLayerNetwork) model).getLayerWiseConfigurations().getEpochCount(); } else if (model instanceof ComputationGraph) { - return ((ComputationGraph) model).getConfiguration().getEpochCount(); + return ((ComputationGraph) model).getComputationGraphConfiguration().getEpochCount(); } else { return model.conf().getEpochCount(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/solvers/BaseOptimizer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/solvers/BaseOptimizer.java index 3a8bfee10..42ce490e5 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/solvers/BaseOptimizer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/optimize/solvers/BaseOptimizer.java @@ -336,7 +336,7 @@ public abstract class BaseOptimizer implements ConvexOptimizer { if (model instanceof MultiLayerNetwork) { return ((MultiLayerNetwork) model).getLayerWiseConfigurations().getIterationCount(); } else if (model instanceof ComputationGraph) { - return ((ComputationGraph) model).getConfiguration().getIterationCount(); + return ((ComputationGraph) model).getComputationGraphConfiguration().getIterationCount(); } else { return model.conf().getIterationCount(); } @@ -347,7 +347,7 @@ public abstract class BaseOptimizer implements ConvexOptimizer { MultiLayerConfiguration conf = ((MultiLayerNetwork) model).getLayerWiseConfigurations(); conf.setIterationCount(conf.getIterationCount() + incrementBy); } else if (model instanceof ComputationGraph) { - ComputationGraphConfiguration conf = ((ComputationGraph) model).getConfiguration(); + ComputationGraphConfiguration conf = ((ComputationGraph) model).getComputationGraphConfiguration(); conf.setIterationCount(conf.getIterationCount() + incrementBy); } else { model.conf().setIterationCount(model.conf().getIterationCount() + incrementBy); @@ -358,7 +358,7 @@ public abstract class BaseOptimizer implements ConvexOptimizer { if (model instanceof MultiLayerNetwork) { return ((MultiLayerNetwork) model).getLayerWiseConfigurations().getEpochCount(); } else if (model instanceof ComputationGraph) { - return ((ComputationGraph) model).getConfiguration().getEpochCount(); + return ((ComputationGraph) model).getComputationGraphConfiguration().getEpochCount(); } else { return model.conf().getEpochCount(); } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/Convolution1DUtils.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/Convolution1DUtils.java index 32c40bdfc..53bed93a2 100644 --- 
a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/Convolution1DUtils.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/Convolution1DUtils.java @@ -79,7 +79,7 @@ public class Convolution1DUtils { * @return the format for the layer */ public static RNNFormat getRnnFormatFromLayer(Layer layer) { - Preconditions.checkState(hasRnnDataFormat(layer),"Layer of type " + layer.getClass().getName() + " and name " + layer.getLayerName() + " does not have an RNNFormat"); + Preconditions.checkState(hasRnnDataFormat(layer),"ILayer of type " + layer.getClass().getName() + " and name " + layer.getLayerName() + " does not have an RNNFormat"); if(layer instanceof SimpleRnn) { SimpleRnn simpleRnn = (SimpleRnn) layer; return simpleRnn.getRnnDataFormat(); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/CrashReportingUtil.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/CrashReportingUtil.java index ac28ced80..5227ad77f 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/CrashReportingUtil.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/CrashReportingUtil.java @@ -320,12 +320,12 @@ public class CrashReportingUtil { appendHelperInformation(sb, mln.getLayers()); appendActivationShapes(mln, (inputTypes == null || inputTypes.length == 0 ? null : inputTypes[0]), minibatch, sb, bytesPerElement); } else { - sb.append(f("Backprop Type", cg.getConfiguration().getBackpropType())); - if(cg.getConfiguration().getBackpropType() == BackpropType.TruncatedBPTT){ - sb.append(f("TBPTT Length", cg.getConfiguration().getTbpttFwdLength() + "/" + cg.getConfiguration().getTbpttBackLength())); + sb.append(f("Backprop Type", cg.getComputationGraphConfiguration().getBackpropType())); + if(cg.getComputationGraphConfiguration().getBackpropType() == BackpropType.TruncatedBPTT){ + sb.append(f("TBPTT Length", cg.getComputationGraphConfiguration().getTbpttFwdLength() + "/" + cg.getComputationGraphConfiguration().getTbpttBackLength())); } - sb.append(f("Workspace Mode: Training", cg.getConfiguration().getTrainingWorkspaceMode())); - sb.append(f("Workspace Mode: Inference", cg.getConfiguration().getInferenceWorkspaceMode())); + sb.append(f("Workspace Mode: Training", cg.getComputationGraphConfiguration().getTrainingWorkspaceMode())); + sb.append(f("Workspace Mode: Inference", cg.getComputationGraphConfiguration().getInferenceWorkspaceMode())); appendLayerInformation(sb, cg.getLayers(), bytesPerElement); appendHelperInformation(sb, cg.getLayers()); appendActivationShapes(cg, sb, bytesPerElement); @@ -461,13 +461,13 @@ public class CrashReportingUtil { List l = new ArrayList<>(layerClasses.keySet()); Collections.sort(l); sb.append(f("Number of Layers", layers.length)); - sb.append("Layer Counts\n"); + sb.append("ILayer Counts\n"); for(String s : l){ sb.append(" ").append(f(s, layerClasses.get(s))); } - sb.append("Layer Parameter Breakdown\n"); + sb.append("ILayer Parameter Breakdown\n"); String format = " %-3s %-20s %-20s %-20s %-20s"; - sb.append(String.format(format, "Idx", "Name", "Layer Type", "Layer # Parameters", "Layer Parameter Memory")).append("\n"); + sb.append(String.format(format, "Idx", "Name", "ILayer Type", "ILayer # Parameters", "ILayer Parameter Memory")).append("\n"); for(Layer layer : layers){ long numParams = layer.numParams(); sb.append(String.format(format, layer.getIndex(), layer.conf().getLayer().getLayerName(), @@ -477,13 +477,13 @@ public class CrashReportingUtil { } private static void 
appendHelperInformation(StringBuilder sb, org.deeplearning4j.nn.api.Layer[] layers){ - sb.append("\n----- Layer Helpers - Memory Use -----\n"); + sb.append("\n----- ILayer Helpers - Memory Use -----\n"); int helperCount = 0; long helperWithMemCount = 0L; long totalHelperMem = 0L; - //Layer index, layer name, layer class, helper class, total memory, breakdown + //ILayer index, layer name, layer class, helper class, total memory, breakdown String format = "%-3s %-20s %-25s %-30s %-12s %s"; boolean header = false; for(Layer l : layers){ @@ -509,7 +509,7 @@ public class CrashReportingUtil { if(!header){ - sb.append(String.format(format, "#", "Layer Name", "Layer Class", "Helper Class", "Total Memory", "Memory Breakdown")) + sb.append(String.format(format, "#", "ILayer Name", "ILayer Class", "Helper Class", "Total Memory", "Memory Breakdown")) .append("\n"); header = true; } @@ -551,7 +551,7 @@ public class CrashReportingUtil { sb.append(f("Input Shape", Arrays.toString(inputShape))); List inputTypes = net.getLayerWiseConfigurations().getLayerActivationTypes(inputType); String format = "%-3s %-20s %-20s %-42s %-20s %-12s %-12s"; - sb.append(String.format(format, "Idx", "Name", "Layer Type", "Activations Type", "Activations Shape", + sb.append(String.format(format, "Idx", "Name", "ILayer Type", "Activations Type", "Activations Shape", "# Elements", "Memory")).append("\n"); org.deeplearning4j.nn.api.Layer[] layers = net.getLayers(); long totalActivationBytes = 0; @@ -598,11 +598,11 @@ public class CrashReportingUtil { for( int i=0; i inputTypes = net.getConfiguration().getLayerActivationTypes(inputType); + Map inputTypes = net.getComputationGraphConfiguration().getLayerActivationTypes(inputType); GraphIndices indices = net.calculateIndices(); String format = "%-3s %-20s %-20s %-42s %-20s %-12s %-12s"; - sb.append(String.format(format, "Idx", "Name", "Layer Type", "Activations Type", "Activations Shape", + sb.append(String.format(format, "Idx", "Name", "ILayer Type", "Activations Type", "Activations Shape", "# Elements", "Memory")).append("\n"); org.deeplearning4j.nn.api.Layer[] layers = net.getLayers(); long totalActivationBytes = 0; @@ -633,7 +633,7 @@ public class CrashReportingUtil { sb.append(String.format(format, i, layerName, className, it, Arrays.toString(shape), (numElements < 0 ? 
"" : String.valueOf(numElements)), fBytes(bytes))).append("\n"); - if(!net.getConfiguration().getNetworkOutputs().contains(layerName)){ + if(!net.getComputationGraphConfiguration().getNetworkOutputs().contains(layerName)){ totalExOutput += bytes; } } diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/ModelSerializer.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/ModelSerializer.java index ae7e2e2df..e636334fd 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/ModelSerializer.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/ModelSerializer.java @@ -141,7 +141,7 @@ public class ModelSerializer { if (model instanceof MultiLayerNetwork) { json = ((MultiLayerNetwork) model).getLayerWiseConfigurations().toJson(); } else if (model instanceof ComputationGraph) { - json = ((ComputationGraph) model).getConfiguration().toJson(); + json = ((ComputationGraph) model).getComputationGraphConfiguration().toJson(); } ZipEntry config = new ZipEntry(CONFIGURATION_JSON); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/NetworkUtils.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/NetworkUtils.java index 7ed0a4bcb..4348be74a 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/NetworkUtils.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/NetworkUtils.java @@ -199,7 +199,7 @@ public class NetworkUtils { * Note: If the layer has no learning rate (no parameters, or an updater without a learning rate) then null is returned * * @param net Network - * @param layerNumber Layer number to get the learning rate for + * @param layerNumber ILayer number to get the learning rate for * @return Learning rate for the specified layer, or null */ public static Double getLearningRate(MultiLayerNetwork net, int layerNumber) { @@ -321,13 +321,13 @@ public class NetworkUtils { * Note: If the layer has no learning rate (no parameters, or an updater without a learning rate) then null is returned * * @param net Network - * @param layerName Layer name to get the learning rate for + * @param layerName ILayer name to get the learning rate for * @return Learning rate for the specified layer, or null */ public static Double getLearningRate(ComputationGraph net, String layerName) { Layer l = net.getLayer(layerName).conf().getLayer(); - int iter = net.getConfiguration().getIterationCount(); - int epoch = net.getConfiguration().getEpochCount(); + int iter = net.getComputationGraphConfiguration().getIterationCount(); + int epoch = net.getComputationGraphConfiguration().getEpochCount(); if (l instanceof BaseLayer) { BaseLayer bl = (BaseLayer) l; IUpdater u = bl.getIUpdater(); diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/OutputLayerUtil.java b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/OutputLayerUtil.java index 08a3d086a..fb3d9ea64 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/OutputLayerUtil.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/OutputLayerUtil.java @@ -68,7 +68,7 @@ public class OutputLayerUtil { * * If the specified layer is not an output layer, this is a no-op * @param layerName Name of the layer - * @param layer Layer + * @param layer ILayer */ public static void validateOutputLayer(String layerName, Layer layer){ IActivation activation; diff --git a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java 
b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java index df4583cd8..eb5814b49 100644 --- a/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java +++ b/cavis-dnn/cavis-dnn-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java @@ -440,7 +440,7 @@ public class TimeSeriesUtils { /** * Get the {@link RNNFormat} from the RNN layer, accounting for the presence of wrapper layers like Bidirectional, * LastTimeStep, etc - * @param layer Layer to get the RNNFormat from + * @param layer ILayer to get the RNNFormat from */ public static RNNFormat getFormatFromRnnLayer(Layer layer){ if(layer instanceof BaseRecurrentLayer){ diff --git a/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/api/dnnTest.java b/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/api/dnnTest.java new file mode 100644 index 000000000..06c322a57 --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/api/dnnTest.java @@ -0,0 +1,127 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. + * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.api; + +import static net.brutex.ai.dnn.api.dnn.*; +import static org.junit.jupiter.api.Assertions.*; + +import java.util.Iterator; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.lang3.RandomUtils; +import org.deeplearning4j.datasets.iterator.FloatsDataSetIterator; +import org.deeplearning4j.nn.conf.CacheMode; +import org.deeplearning4j.nn.conf.GradientNormalization; +import org.deeplearning4j.nn.conf.Updater; +import org.deeplearning4j.nn.conf.inputs.InputType; +import org.deeplearning4j.nn.conf.layers.ActivationLayer; +import org.deeplearning4j.nn.weights.WeightInitXavier; +import org.junit.jupiter.api.Test; +import org.nd4j.common.primitives.Pair; +import org.nd4j.linalg.activations.impl.ActivationIdentity; +import org.nd4j.linalg.activations.impl.ActivationLReLU; +import org.nd4j.linalg.learning.config.Adam; + + +class dnnTest { + + @Test + void testFFLayer() { + int numFeatures = 128; + int batchSize = 10; + int numRows = 1000; + AtomicInteger cnt = new AtomicInteger(0); + FloatsDataSetIterator iterator = new FloatsDataSetIterator(floatIterable(numRows, numFeatures), batchSize); + + assertTrue(iterator.hasNext()); + + /** + * MultiLayerConfiguration confxx = new NeuralNetConfiguration.Builder() + * .seed(42) + * .updater(UPDATER) + * .gradientNormalization(GradientNormalization.RenormalizeL2PerLayer) + * .gradientNormalizationThreshold(GRADIENT_THRESHOLD) + * .weightInit(WeightInit.XAVIER) + * .activation(Activation.IDENTITY) + * .list(genLayers()) + * .setInputType(InputType.convolutional(X_DIM, Y_DIM, CHANNELS)) + * // .inputPreProcessor("CNN1", new 
FeedForwardToCnnPreProcessor(Y_DIM, X_DIM, CHANNELS)) + * .build(); + */ + + /** + * new DenseLayer.Builder().nIn(INPUT).nOut(X_DIM*Y_DIM*CHANNELS).weightInit(WeightInit.NORMAL).build(), + * new ActivationLayer.Builder(new ActivationLReLU(0.2)).build(), + * new DenseLayer.Builder().nIn(X_DIM*Y_DIM*CHANNELS).nOut(X_DIM*Y_DIM).build(), + * new ActivationLayer.Builder(new ActivationLReLU(0.2)).build(), + * new DenseLayer.Builder().nIn(X_DIM*Y_DIM).nOut(X_DIM*Y_DIM).build(), + * new ActivationLayer.Builder(new ActivationLReLU(0.2)).build(), + * new DenseLayer.Builder().nIn(X_DIM*Y_DIM).nOut(X_DIM*Y_DIM*CHANNELS).activation(Activation.TANH) + */ + dnn.conf() + .seed(42) + .updater( Adam.builder().learningRate(0.0002).beta1(0.5).build() ) + .gradientNormalization( GradientNormalization.RenormalizeL2PerLayer) + .gradientNormalizationThreshold( 100 ) + .weightInit( new WeightInitXavier() ) + .activation( new ActivationIdentity() ) + .inputType( InputType.convolutional( 28, 28, 1)) + .layer( dnn.DenseLayer(10,30).build() ) + .layer(new ActivationLayer.Builder(new ActivationLReLU(0.2)).build() ) + + ; + + + } + + protected static Iterable> floatIterable(final int totalRows, final int numColumns) { + return new Iterable>() { + @Override + public Iterator> iterator() { + return new Iterator>() { + private final AtomicInteger cnt = new AtomicInteger(0); + + @Override + public boolean hasNext() { + return cnt.incrementAndGet() <= totalRows; + } + + @Override + public Pair next() { + float[] features = new float[numColumns]; + float[] labels = new float[numColumns]; + for (int i = 0; i < numColumns; i++) { + features[i] = (float) i; + labels[i] = RandomUtils.nextFloat(0, 5); + } + return Pair.makePair(features, labels); + } + + @Override + public void remove() { + // no-op + } + }; + } + }; + } + +} \ No newline at end of file diff --git a/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/conf/layer/FFLayerTest.java b/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/conf/layer/FFLayerTest.java new file mode 100644 index 000000000..2fa944000 --- /dev/null +++ b/cavis-dnn/cavis-dnn-nn/src/test/java/net/brutex/ai/dnn/conf/layer/FFLayerTest.java @@ -0,0 +1,47 @@ +/* + * + * ****************************************************************************** + * * + * * This program and the accompanying materials are made available under the + * * terms of the Apache License, Version 2.0 which is available at + * * https://www.apache.org/licenses/LICENSE-2.0. + * * + * * See the NOTICE file distributed with this work for additional + * * information regarding copyright ownership. + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * * License for the specific language governing permissions and limitations + * * under the License. 
+ * * + * * SPDX-License-Identifier: Apache-2.0 + * ***************************************************************************** + * + */ + +package net.brutex.ai.dnn.conf.layer; + +import net.brutex.ai.dnn.api.IModel; +import net.brutex.ai.dnn.api.INeuralNetworkConfiguration; +import net.brutex.ai.dnn.api.ILayerConfiguration; +import org.junit.jupiter.api.Test; + +class FFLayerTest { + + @Test + void instantiate() { + ILayerConfiguration ff_conf = FeedForwardLayerConfiguration.builder().build(); + INeuralNetworkConfiguration net_conf = net.brutex.ai.dnn.conf.NeuralNetworkConfiguration.builder() + .layerConfiguration(ff_conf) + .build(); + IModel network = net.brutex.ai.dnn.impl.network.NeuralNetwork.builder().name("Test Network") + .configuration(net_conf) + .build(); + ff_conf.instantiate(network); + + } + + @Test + void getOutputType() { + } +} \ No newline at end of file diff --git a/cavis-dnn/cavis-dnn-nn/src/test/java/org/deeplearning4j/nn/layers/HelperUtilsTest.java b/cavis-dnn/cavis-dnn-nn/src/test/java/org/deeplearning4j/nn/layers/HelperUtilsTest.java index bd05f187f..a3d21fb0c 100644 --- a/cavis-dnn/cavis-dnn-nn/src/test/java/org/deeplearning4j/nn/layers/HelperUtilsTest.java +++ b/cavis-dnn/cavis-dnn-nn/src/test/java/org/deeplearning4j/nn/layers/HelperUtilsTest.java @@ -34,7 +34,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; /** */ -@DisplayName("Activation Layer Test") +@DisplayName("Activation ILayer Test") public class HelperUtilsTest extends BaseDL4JTest { @Override diff --git a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/InplaceParallelInference.java b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/InplaceParallelInference.java index 20dcd51d9..9f32446ae 100644 --- a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/InplaceParallelInference.java +++ b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/InplaceParallelInference.java @@ -29,7 +29,6 @@ import org.deeplearning4j.nn.conf.ComputationGraphConfiguration; import org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -import org.deeplearning4j.parallelism.inference.InferenceMode; import org.deeplearning4j.parallelism.inference.LoadBalanceMode; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.exception.ND4JIllegalStateException; @@ -195,7 +194,7 @@ public class InplaceParallelInference extends ParallelInference { for (int e = 0; e < workers; e++) { if (sourceModel instanceof ComputationGraph) { // building configuration with shared parameters - val model = new ComputationGraph(ComputationGraphConfiguration.fromJson(((ComputationGraph) sourceModel).getConfiguration().toJson())); + val model = new ComputationGraph(ComputationGraphConfiguration.fromJson(((ComputationGraph) sourceModel).getComputationGraphConfiguration().toJson())); model.init(params, false); Nd4j.getExecutioner().commit(); diff --git a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelInference.java b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelInference.java index 52a28606e..8547e7b9f 100644 --- a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelInference.java +++ 
b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelInference.java @@ -458,7 +458,7 @@ public class ParallelInference { if (protoModel instanceof ComputationGraph) { if (!rootDevice) { this.replicatedModel = new ComputationGraph(ComputationGraphConfiguration - .fromJson(((ComputationGraph) protoModel).getConfiguration().toJson())); + .fromJson(((ComputationGraph) protoModel).getComputationGraphConfiguration().toJson())); this.replicatedModel.init(); synchronized (locker) { diff --git a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/trainer/DefaultTrainer.java b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/trainer/DefaultTrainer.java index a1909795a..be706234f 100644 --- a/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/trainer/DefaultTrainer.java +++ b/cavis-dnn/cavis-dnn-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/trainer/DefaultTrainer.java @@ -329,7 +329,7 @@ public class DefaultTrainer extends Thread implements Trainer { } else if (originalModel instanceof ComputationGraph) { if (!onRootModel) { ComputationGraphConfiguration conf = ComputationGraphConfiguration - .fromJson(((ComputationGraph) originalModel).getConfiguration().toJson()); + .fromJson(((ComputationGraph) originalModel).getComputationGraphConfiguration().toJson()); conf.setTrainingWorkspaceMode(workspaceMode); this.replicatedModel = new ComputationGraph(conf); @@ -354,7 +354,7 @@ public class DefaultTrainer extends Thread implements Trainer { } else { this.replicatedModel = originalModel; this.replicatedModel.init(); - ((ComputationGraph) replicatedModel).getConfiguration().setTrainingWorkspaceMode(workspaceMode); + ((ComputationGraph) replicatedModel).getComputationGraphConfiguration().setTrainingWorkspaceMode(workspaceMode); } } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/SparkComputationGraph.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/SparkComputationGraph.java index 67b120ddf..e460ddc2f 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/SparkComputationGraph.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/SparkComputationGraph.java @@ -102,7 +102,7 @@ public class SparkComputationGraph extends SparkListenable { TrainingMaster trainingMaster) { sc = javaSparkContext; this.trainingMaster = trainingMaster; - this.conf = network.getConfiguration().clone(); + this.conf = network.getComputationGraphConfiguration().clone(); this.network = network; this.network.init(); diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionErrorWithKeyFunction.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionErrorWithKeyFunction.java index 3fa3312d7..b7da3d143 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionErrorWithKeyFunction.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionErrorWithKeyFunction.java @@ -56,7 +56,7 @@ public class CGVaeReconstructionErrorWithKeyFunction extends BaseVaeScoreWith if (!(l instanceof 
VariationalAutoencoder)) { throw new RuntimeException( "Cannot use CGVaeReconstructionErrorWithKeyFunction on network that doesn't have a VAE " - + "layer as layer 0. Layer type: " + l.getClass()); + + "layer as layer 0. ILayer type: " + l.getClass()); } return (VariationalAutoencoder) l; } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionProbWithKeyFunction.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionProbWithKeyFunction.java index a71912367..43defe37f 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionProbWithKeyFunction.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/graph/scoring/CGVaeReconstructionProbWithKeyFunction.java @@ -58,7 +58,7 @@ public class CGVaeReconstructionProbWithKeyFunction extends BaseVaeReconstruc if (!(l instanceof VariationalAutoencoder)) { throw new RuntimeException( "Cannot use CGVaeReconstructionProbWithKeyFunction on network that doesn't have a VAE " - + "layer as layer 0. Layer type: " + l.getClass()); + + "layer as layer 0. ILayer type: " + l.getClass()); } return (VariationalAutoencoder) l; } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionErrorWithKeyFunction.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionErrorWithKeyFunction.java index e1c2f760d..a0bcca02b 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionErrorWithKeyFunction.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionErrorWithKeyFunction.java @@ -59,7 +59,7 @@ public class VaeReconstructionErrorWithKeyFunction extends BaseVaeScoreWithKe if (!(l instanceof VariationalAutoencoder)) { throw new RuntimeException( "Cannot use VaeReconstructionErrorWithKeyFunction on network that doesn't have a VAE " - + "layer as layer 0. Layer type: " + l.getClass()); + + "layer as layer 0. ILayer type: " + l.getClass()); } return (VariationalAutoencoder) l; } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionProbWithKeyFunction.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionProbWithKeyFunction.java index 12fbbbeb6..d65084dc5 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionProbWithKeyFunction.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/multilayer/scoring/VaeReconstructionProbWithKeyFunction.java @@ -59,7 +59,7 @@ public class VaeReconstructionProbWithKeyFunction extends BaseVaeReconstructi if (!(l instanceof VariationalAutoencoder)) { throw new RuntimeException( "Cannot use VaeReconstructionProbWithKeyFunction on network that doesn't have a VAE " - + "layer as layer 0. Layer type: " + l.getClass()); + + "layer as layer 0. 
ILayer type: " + l.getClass()); } return (VariationalAutoencoder) l; } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/paramavg/ParameterAveragingTrainingMaster.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/paramavg/ParameterAveragingTrainingMaster.java index 3a2170bc3..4a0252b28 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/paramavg/ParameterAveragingTrainingMaster.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/main/java/org/deeplearning4j/spark/impl/paramavg/ParameterAveragingTrainingMaster.java @@ -292,7 +292,7 @@ public class ParameterAveragingTrainingMaster @Override public ParameterAveragingTrainingWorker getWorkerInstance(SparkComputationGraph graph) { - NetBroadcastTuple tuple = new NetBroadcastTuple(graph.getNetwork().getConfiguration(), + NetBroadcastTuple tuple = new NetBroadcastTuple(graph.getNetwork().getComputationGraphConfiguration(), graph.getNetwork().params(), graph.getNetwork().getUpdater().getStateViewArray()); if (collectTrainingStats) @@ -731,7 +731,7 @@ public class ParameterAveragingTrainingMaster int numUpdates = averagingFrequency; conf.setIterationCount(conf.getIterationCount() + numUpdates); } else { - ComputationGraphConfiguration conf = graph.getNetwork().getConfiguration(); + ComputationGraphConfiguration conf = graph.getNetwork().getComputationGraphConfiguration(); int numUpdates = averagingFrequency; conf.setIterationCount(conf.getIterationCount() + numUpdates); } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/misc/TestFrozenLayers.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/misc/TestFrozenLayers.java index 887696af3..c899fae04 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/misc/TestFrozenLayers.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/misc/TestFrozenLayers.java @@ -118,7 +118,7 @@ public class TestFrozenLayers extends BaseSparkTest { boolean isFrozen = entry.getKey().startsWith("0_") || entry.getKey().startsWith("1_"); if (isFrozen) { - //Layer should be frozen -> no change + //ILayer should be frozen -> no change assertEquals(orig, now, entry.getKey()); } else { //Not frozen -> should be different @@ -195,7 +195,7 @@ public class TestFrozenLayers extends BaseSparkTest { boolean isFrozen = entry.getKey().startsWith("0_") || entry.getKey().startsWith("1_"); if (isFrozen) { - //Layer should be frozen -> no change + //ILayer should be frozen -> no change assertEquals(orig, now, entry.getKey()); } else { //Not frozen -> should be different diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java index 48a30034a..c2c24a617 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-core/src/test/java/org/deeplearning4j/spark/impl/paramavg/TestSparkMultiLayerParameterAveraging.java @@ -835,12 +835,12 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { 
JavaRDD rdd = sc.parallelize(list); - assertEquals(0, sparkNet.getNetwork().getConfiguration().getIterationCount()); + assertEquals(0, sparkNet.getNetwork().getComputationGraphConfiguration().getIterationCount()); sparkNet.fit(rdd); - assertEquals(minibatchesPerWorkerPerEpoch, sparkNet.getNetwork().getConfiguration().getIterationCount()); + assertEquals(minibatchesPerWorkerPerEpoch, sparkNet.getNetwork().getComputationGraphConfiguration().getIterationCount()); sparkNet.fit(rdd); assertEquals(2 * minibatchesPerWorkerPerEpoch, - sparkNet.getNetwork().getConfiguration().getIterationCount()); + sparkNet.getNetwork().getComputationGraphConfiguration().getIterationCount()); sparkNet.getTrainingMaster().deleteTempFiles(sc); } @@ -1076,11 +1076,11 @@ public class TestSparkMultiLayerParameterAveraging extends BaseSparkTest { for(int i=0; i<3; i++ ){ assertEquals(i, sn1.getNetwork().getLayerWiseConfigurations().getEpochCount()); - assertEquals(i, sn2.getNetwork().getConfiguration().getEpochCount()); + assertEquals(i, sn2.getNetwork().getComputationGraphConfiguration().getEpochCount()); sn1.fit(rdd); sn2.fit(rdd); assertEquals(i+1, sn1.getNetwork().getLayerWiseConfigurations().getEpochCount()); - assertEquals(i+1, sn2.getNetwork().getConfiguration().getEpochCount()); + assertEquals(i+1, sn2.getNetwork().getComputationGraphConfiguration().getEpochCount()); } } } diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/pw/SharedTrainingWrapper.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/pw/SharedTrainingWrapper.java index a9e2a213b..7e521f0c1 100644 --- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/pw/SharedTrainingWrapper.java +++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/pw/SharedTrainingWrapper.java @@ -375,8 +375,8 @@ public class SharedTrainingWrapper { ((MultiLayerNetwork) model).setIterationCount(ModelParameterServer.getInstance().getStartPosition().getFirst()); ((MultiLayerNetwork) model).setEpochCount(ModelParameterServer.getInstance().getStartPosition().getSecond()); } else if (originalModel instanceof ComputationGraph) { - ((ComputationGraph) model).getConfiguration().setIterationCount(ModelParameterServer.getInstance().getStartPosition().getFirst()); - ((ComputationGraph) model).getConfiguration().setEpochCount(ModelParameterServer.getInstance().getStartPosition().getSecond()); + ((ComputationGraph) model).getComputationGraphConfiguration().setIterationCount(ModelParameterServer.getInstance().getStartPosition().getFirst()); + ((ComputationGraph) model).getComputationGraphConfiguration().setEpochCount(ModelParameterServer.getInstance().getStartPosition().getSecond()); } // if we're going to extend iteratation for debugging purposes - let's do that here @@ -421,7 +421,7 @@ public class SharedTrainingWrapper { // ok. 
             // ok. attaching accumulator to model
             if (model instanceof ComputationGraph) {
-                ((ComputationGraph) originalModel).getConfiguration()
+                ((ComputationGraph) originalModel).getComputationGraphConfiguration()
                         .setTrainingWorkspaceMode(trainingConfiguration.getWorkspaceMode());
                 ((ComputationGraph) originalModel).setGradientsAccumulator(accumulator);
             } else if (model instanceof MultiLayerNetwork) {
diff --git a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/training/SharedTrainingMaster.java b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/training/SharedTrainingMaster.java
index f0b6bc151..1a11d70a5 100644
--- a/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/training/SharedTrainingMaster.java
+++ b/cavis-dnn/cavis-dnn-spark/cavis-dnn-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/training/SharedTrainingMaster.java
@@ -295,7 +295,7 @@ public class SharedTrainingMaster extends BaseTrainingMaster
 layerNames = getlayerNames();
 for (String s : layerNames) {
@@ -728,7 +728,7 @@ public class SbeStatsReport implements StatsReport, AgronaPersistable {
             pne.next().paramName(s);
         }
-        //Layer names
+        //ILayer names
         List layerNames = getlayerNames();
         UpdateEncoder.LayerNamesEncoder lne = ue.layerNamesCount(layerNames.size());
         for (String s : layerNames) {
diff --git a/cavis-ui/cavis-ui-vertx/src/main/java/org/deeplearning4j/ui/module/train/TrainModuleUtils.java b/cavis-ui/cavis-ui-vertx/src/main/java/org/deeplearning4j/ui/module/train/TrainModuleUtils.java
index ff6f00901..274e670f6 100644
--- a/cavis-ui/cavis-ui-vertx/src/main/java/org/deeplearning4j/ui/module/train/TrainModuleUtils.java
+++ b/cavis-ui/cavis-ui-vertx/src/main/java/org/deeplearning4j/ui/module/train/TrainModuleUtils.java
@@ -182,7 +182,7 @@ public class TrainModuleUtils {
             long inputSize = (i == 0 ? va.getNIn() : encLayerSizes[i - 1]);
             long outputSize = encLayerSizes[i];
             encoderInfo.put("Input Size", String.valueOf(inputSize));
-            encoderInfo.put("Layer Size", String.valueOf(outputSize));
+            encoderInfo.put("ILayer Size", String.valueOf(outputSize));
             encoderInfo.put("Num Parameters", String.valueOf((inputSize + 1) * outputSize));
             encoderInfo.put("Activation Function", va.getActivationFn().toString());
             layerInfo.add(encoderInfo);
@@ -197,7 +197,7 @@ public class TrainModuleUtils {
         long inputSize = encLayerSizes[encLayerSizes.length - 1];
         long outputSize = va.getNOut();
         latentInfo.put("Input Size", String.valueOf(inputSize));
-        latentInfo.put("Layer Size", String.valueOf(outputSize));
+        latentInfo.put("ILayer Size", String.valueOf(outputSize));
         latentInfo.put("Num Parameters", String.valueOf((inputSize + 1) * outputSize * 2));
         latentInfo.put("Activation Function", va.getPzxActivationFn().toString());
         layerInfo.add(latentInfo);
@@ -216,7 +216,7 @@ public class TrainModuleUtils {
             inputSize = (i == 0 ? va.getNOut() : decLayerSizes[i - 1]);
             outputSize = decLayerSizes[i];
             decoderInfo.put("Input Size", String.valueOf(inputSize));
-            decoderInfo.put("Layer Size", String.valueOf(outputSize));
+            decoderInfo.put("ILayer Size", String.valueOf(outputSize));
             decoderInfo.put("Num Parameters", String.valueOf((inputSize + 1) * outputSize));
             decoderInfo.put("Activation Function", va.getActivationFn().toString());
             layerInfo.add(decoderInfo);
@@ -231,7 +231,7 @@
         inputSize = decLayerSizes[decLayerSizes.length - 1];
         outputSize = va.getNIn();
         reconstructionInfo.put("Input Size", String.valueOf(inputSize));
-        reconstructionInfo.put("Layer Size", String.valueOf(outputSize));
+        reconstructionInfo.put("ILayer Size", String.valueOf(outputSize));
         reconstructionInfo.put("Num Parameters", String
                 .valueOf((inputSize + 1) * va.getOutputDistribution().distributionInputSize((int) va.getNIn())));
         reconstructionInfo.put("Distribution", va.getOutputDistribution().toString());
diff --git a/cavis-ui/cavis-ui-vertx/src/main/resources/templates/TrainingModel.html.ftl b/cavis-ui/cavis-ui-vertx/src/main/resources/templates/TrainingModel.html.ftl
index 859aae287..51d63af6b 100644
--- a/cavis-ui/cavis-ui-vertx/src/main/resources/templates/TrainingModel.html.ftl
+++ b/cavis-ui/cavis-ui-vertx/src/main/resources/templates/TrainingModel.html.ftl
@@ -103,7 +103,7 @@
- +
@@ -179,7 +179,7 @@
- +
@@ -244,7 +244,7 @@
- +
diff --git a/cavis-zoo/cavis-zoo-models/src/test/java/org/deeplearning4j/zoo/TestUtils.java b/cavis-zoo/cavis-zoo-models/src/test/java/org/deeplearning4j/zoo/TestUtils.java
index a61ae386d..44d9dff3c 100644
--- a/cavis-zoo/cavis-zoo-models/src/test/java/org/deeplearning4j/zoo/TestUtils.java
+++ b/cavis-zoo/cavis-zoo-models/src/test/java/org/deeplearning4j/zoo/TestUtils.java
@@ -65,7 +65,7 @@ public class TestUtils {
         ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
         ComputationGraph restored = ModelSerializer.restoreComputationGraph(bais, true);
-        assertEquals(net.getConfiguration(), restored.getConfiguration());
+        assertEquals(net.getComputationGraphConfiguration(), restored.getComputationGraphConfiguration());
         assertEquals(net.params(), restored.params());
         return restored;
diff --git a/settings.gradle b/settings.gradle
index 80b29bef8..d7875c751 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -100,7 +100,7 @@ include ':cavis-dnn:cavis-dnn-data:cavis-dnn-data-utility-iterators'
 include ':cavis-dnn:cavis-dnn-modelimport'
 include ':cavis-dnn:cavis-dnn-nlp'
 include ':cavis-dnn:cavis-dnn-nn'
-include ':cavis-dnn:cavis-dnn-nn-api'
+//include ':cavis-dnn:cavis-dnn-nn-api'
 include ':cavis-dnn:cavis-dnn-nn-parent'
 include ':cavis-dnn:cavis-dnn-nn-parent:cavis-dnn-nn-server'
 include ':cavis-dnn:cavis-dnn-nn-parent:cavis-dnn-nn-client'
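The hunks above consistently migrate ComputationGraph call sites from getConfiguration() to getComputationGraphConfiguration(). A minimal sketch of the renamed accessor in use, assuming only the classes and methods already shown in this patch; the helper class itself is hypothetical and for illustration only:

    // Hypothetical helper, not part of this patch: it exercises the renamed
    // accessor the same way the updated training-master and test code does.
    import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
    import org.deeplearning4j.nn.graph.ComputationGraph;

    class ConfigurationAccessSketch {
        static void bumpIterationCount(ComputationGraph net, int numUpdates) {
            // getComputationGraphConfiguration() replaces the former getConfiguration()
            ComputationGraphConfiguration conf = net.getComputationGraphConfiguration();
            conf.setIterationCount(conf.getIterationCount() + numUpdates);
        }
    }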