DL4J NWC support for RNNs (#379)

* merge conf * merge conf * conf fix * NWC initial * revert pom.xml * revert pom.xml * default NCW * bidirectional+some tests * RNNOutputLayer, RNNLossLayer, Graves + tests * rnn tests * LastTimeStep + tests * masking + tests * graves, rnnoutput, rnnloss * nwc timeseries reverse * more tests * bi-graveslstm test * fixes * rnn df tests basic * bug fix: cudnn fallback * bug fix * misc * graveslstm tests * preprocessor fixes * TimeDistributed * more tests * RnnLossLayer builder def val * copyright headers * Remove debug println Signed-off-by: Alex Black <blacka101@gmail.com> * Small fix + test naming Signed-off-by: Alex Black <blacka101@gmail.com> * Parameterized test name Signed-off-by: Alex Black <blacka101@gmail.com> * fix LastTimeStep masked * Fix MaskZero mask datatype issue Signed-off-by: Alex Black <blacka101@gmail.com> * rem println * javadoc * Fixes Signed-off-by: Alex Black <blacka101@gmail.com> Co-authored-by: Alex Black <blacka101@gmail.com>
2020-04-23 06:16:44 +04:00 · 2020-04-23 06:16:44 +04:00 · 2ecabde500
commit 2ecabde500
parent 032b97912e
48 changed files with 1256 additions and 278 deletions
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/gradientcheck/RnnGradientChecks.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/gradientcheck/RnnGradientChecks.java
@ -338,7 +338,7 @@ public class RnnGradientChecks extends BaseDL4JTest {
                        .weightInit(WeightInit.XAVIER)
                        .list()
                        .layer(new LSTM.Builder().nOut(layerSize).build())
-                        .layer(new TimeDistributed(new DenseLayer.Builder().nOut(layerSize).activation(Activation.SOFTMAX).build(), 2))
+                        .layer(new TimeDistributed(new DenseLayer.Builder().nOut(layerSize).activation(Activation.SOFTMAX).build()))
                        .layer(new RnnOutputLayer.Builder().nOut(nOut).activation(Activation.SOFTMAX)
                                .lossFunction(LossFunctions.LossFunction.MCXENT).build())
                        .setInputType(InputType.recurrent(nIn))
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/dtypes/DTypeTests.java
@ -819,7 +819,7 @@ public class DTypeTests extends BaseDL4JTest {
                            .layer(new DenseLayer.Builder().nOut(5).build())
                            .layer(new GravesBidirectionalLSTM.Builder().nIn(5).nOut(5).activation(Activation.TANH).build())
                            .layer(new Bidirectional(new LSTM.Builder().nIn(5).nOut(5).activation(Activation.TANH).build()))
-                            .layer(new TimeDistributed(new DenseLayer.Builder().nIn(10).nOut(5).activation(Activation.TANH).build(), 2))
+                            .layer(new TimeDistributed(new DenseLayer.Builder().nIn(10).nOut(5).activation(Activation.TANH).build()))
                            .layer(new SimpleRnn.Builder().nIn(5).nOut(5).build())
                            .layer(new MaskZeroLayer.Builder().underlying(new SimpleRnn.Builder().nIn(5).nOut(5).build()).maskValue(0.0).build())
                            .layer(secondLast)
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/BidirectionalTest.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/BidirectionalTest.java
@ -24,10 +24,7 @@ import org.deeplearning4j.earlystopping.saver.InMemoryModelSaver;
 import org.deeplearning4j.earlystopping.scorecalc.DataSetLossCalculator;
 import org.deeplearning4j.earlystopping.termination.MaxEpochsTerminationCondition;
 import org.deeplearning4j.earlystopping.trainer.EarlyStoppingGraphTrainer;
-import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
-import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
-import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
-import org.deeplearning4j.nn.conf.WorkspaceMode;
+import org.deeplearning4j.nn.conf.*;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM;
 import org.deeplearning4j.nn.conf.layers.GravesLSTM;
@ -45,6 +42,8 @@ import org.deeplearning4j.nn.weights.WeightInit;
 import org.deeplearning4j.util.ModelSerializer;
 import org.deeplearning4j.util.TimeSeriesUtils;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -61,12 +60,22 @@ import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;

+import static org.deeplearning4j.nn.conf.RNNFormat.NCW;
 import static org.junit.Assert.assertEquals;

@Slf4j
+@RunWith(Parameterized.class)
 public class BidirectionalTest extends BaseDL4JTest {

+    private RNNFormat rnnDataFormat;

+    public BidirectionalTest(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+    @Parameterized.Parameters
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void compareImplementations(){
        for(WorkspaceMode wsm : WorkspaceMode.values()) {
@ -82,9 +91,9 @@ public class BidirectionalTest extends BaseDL4JTest {
                    .inferenceWorkspaceMode(wsm)
                    .updater(new Adam())
                    .list()
-                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()))
-                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()))
-                    .layer(new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE)
+                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()))
+                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()))
+                    .layer(new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE).dataFormat(rnnDataFormat)
                            .nIn(10).nOut(10).build())
                    .build();

@ -95,9 +104,9 @@ public class BidirectionalTest extends BaseDL4JTest {
                    .inferenceWorkspaceMode(wsm)
                    .updater(new Adam())
                    .list()
-                    .layer(new GravesBidirectionalLSTM.Builder().nIn(10).nOut(10).build())
-                    .layer(new GravesBidirectionalLSTM.Builder().nIn(10).nOut(10).build())
-                    .layer(new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE)
+                    .layer(new GravesBidirectionalLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
+                    .layer(new GravesBidirectionalLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
+                    .layer(new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE).dataFormat(rnnDataFormat)
                            .nIn(10).nOut(10).build())
                    .build();

@ -116,15 +125,24 @@ public class BidirectionalTest extends BaseDL4JTest {

            net2.setParams(net1.params());  //Assuming exact same layout here...

-            INDArray in = Nd4j.rand(new int[]{3, 10, 5});
+            INDArray in;
+            if (rnnDataFormat == NCW){
+                in = Nd4j.rand(new int[]{3, 10, 5});
+            }else{
+                in = Nd4j.rand(new int[]{3, 5, 10});
+            }

            INDArray out1 = net1.output(in);
            INDArray out2 = net2.output(in);

            assertEquals(out1, out2);

-            INDArray labels = Nd4j.rand(new int[]{3, 10, 5});
-
+            INDArray labels;
+            if (rnnDataFormat == NCW){
+                labels = Nd4j.rand(new int[]{3, 10, 5});
+            }else{
+                labels = Nd4j.rand(new int[]{3, 5, 10});
+            }
            net1.setInput(in);
            net1.setLabels(labels);

@ -276,17 +294,22 @@ public class BidirectionalTest extends BaseDL4JTest {
                    .inferenceWorkspaceMode(wsm)
                    .updater(new Adam())
                    .list()
-                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()))
-                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()))
+                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()))
+                    .layer(new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()))
                    .layer(new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE)
-                            .nIn(10).nOut(10).build())
+                            .nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
                    .build();

            MultiLayerNetwork net1 = new MultiLayerNetwork(conf1);
            net1.init();

-            INDArray in = Nd4j.rand(new int[]{3, 10, 5});
-            INDArray labels = Nd4j.rand(new int[]{3, 10, 5});
+            INDArray in;
+            INDArray labels;
+
+            long[] inshape = rnnDataFormat == NCW ? new long[]{3, 10, 5} : new long[]{3, 5, 10};
+
+            in = Nd4j.rand(inshape);
+            labels = Nd4j.rand(inshape);

            net1.fit(in, labels);

@ -300,8 +323,8 @@ public class BidirectionalTest extends BaseDL4JTest {
            MultiLayerNetwork net2 = ModelSerializer.restoreMultiLayerNetwork(new ByteArrayInputStream(bytes), true);


-            in = Nd4j.rand(new int[]{3, 10, 5});
-            labels = Nd4j.rand(new int[]{3, 10, 5});
+            in = Nd4j.rand(inshape);
+            labels = Nd4j.rand(inshape);

            INDArray out1 = net1.output(in);
            INDArray out2 = net2.output(in);
@ -338,18 +361,18 @@ public class BidirectionalTest extends BaseDL4JTest {
                    .updater(new Adam())
                    .graphBuilder()
                    .addInputs("in")
-                    .layer("0", new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()), "in")
-                    .layer("1", new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).build()), "0")
-                    .layer("2", new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE)
+                    .layer("0", new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()), "in")
+                    .layer("1", new Bidirectional(Bidirectional.Mode.ADD, new GravesLSTM.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()), "0")
+                    .layer("2", new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE).dataFormat(rnnDataFormat)
                            .nIn(10).nOut(10).build(), "1")
                    .setOutputs("2")
                    .build();

            ComputationGraph net1 = new ComputationGraph(conf1);
            net1.init();
-
-            INDArray in = Nd4j.rand(new int[]{3, 10, 5});
-            INDArray labels = Nd4j.rand(new int[]{3, 10, 5});
+            long[] inshape = (rnnDataFormat == NCW)? new long[]{3, 10, 5}: new long[]{3, 5, 10};
+            INDArray in = Nd4j.rand(inshape);
+            INDArray labels = Nd4j.rand(inshape);

            net1.fit(new DataSet(in, labels));

@ -363,8 +386,8 @@ public class BidirectionalTest extends BaseDL4JTest {
            ComputationGraph net2 = ModelSerializer.restoreComputationGraph(new ByteArrayInputStream(bytes), true);


-            in = Nd4j.rand(new int[]{3, 10, 5});
-            labels = Nd4j.rand(new int[]{3, 10, 5});
+            in = Nd4j.rand(inshape);
+            labels = Nd4j.rand(inshape);

            INDArray out1 = net1.outputSingle(in);
            INDArray out2 = net2.outputSingle(in);
@ -394,8 +417,8 @@ public class BidirectionalTest extends BaseDL4JTest {
            Bidirectional.Mode[] modes = new Bidirectional.Mode[]{Bidirectional.Mode.CONCAT, Bidirectional.Mode.ADD,
                    Bidirectional.Mode.AVERAGE, Bidirectional.Mode.MUL};

-
-            INDArray in = Nd4j.rand(new int[]{3, 10, 6});
+            long[] inshape = rnnDataFormat == NCW ? new long[]{3, 10, 6} : new long[]{3, 6, 10};
+            INDArray in = Nd4j.rand(inshape);

            for (Bidirectional.Mode m : modes) {
                MultiLayerConfiguration conf1 = new NeuralNetConfiguration.Builder()
@ -406,7 +429,7 @@ public class BidirectionalTest extends BaseDL4JTest {
                        .inferenceWorkspaceMode(wsm)
                        .updater(new Adam())
                        .list()
-                        .layer(new Bidirectional(m, new SimpleRnn.Builder().nIn(10).nOut(10).build()))
+                        .layer(new Bidirectional(m, new SimpleRnn.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()))
                        .build();

                MultiLayerNetwork net1 = new MultiLayerNetwork(conf1);
@ -418,7 +441,7 @@ public class BidirectionalTest extends BaseDL4JTest {
                        .weightInit(WeightInit.XAVIER)
                        .updater(new Adam())
                        .list()
-                        .layer(new SimpleRnn.Builder().nIn(10).nOut(10).build())
+                        .layer(new SimpleRnn.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
                        .build();

                MultiLayerNetwork net2 = new MultiLayerNetwork(conf2.clone());
@ -434,11 +457,10 @@ public class BidirectionalTest extends BaseDL4JTest {
                net3.setParam("0_RW", net1.getParam("0_bRW"));
                net3.setParam("0_b", net1.getParam("0_bb"));

-                INDArray inReverse = TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT);
-
+                INDArray inReverse = TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT, rnnDataFormat);
                INDArray out1 = net1.output(in);
                INDArray out2 = net2.output(in);
-                INDArray out3 = TimeSeriesUtils.reverseTimeSeries(net3.output(inReverse), LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT);
+                INDArray out3 = TimeSeriesUtils.reverseTimeSeries(net3.output(inReverse), LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT, rnnDataFormat);

                INDArray outExp;
                switch (m) {
@ -452,7 +474,7 @@ public class BidirectionalTest extends BaseDL4JTest {
                        outExp = out2.add(out3).muli(0.5);
                        break;
                    case CONCAT:
-                        outExp = Nd4j.concat(1, out2, out3);
+                        outExp = Nd4j.concat((rnnDataFormat == NCW)?1:2, out2, out3);
                        break;
                    default:
                        throw new RuntimeException();
@ -464,25 +486,25 @@ public class BidirectionalTest extends BaseDL4JTest {
                //Check gradients:
                if (m == Bidirectional.Mode.ADD || m == Bidirectional.Mode.CONCAT) {

-                    INDArray eps = Nd4j.rand(new int[]{3, 10, 6});
+                    INDArray eps = Nd4j.rand(inshape);

                    INDArray eps1;
                    if (m == Bidirectional.Mode.CONCAT) {
-                        eps1 = Nd4j.concat(1, eps, eps);
+                        eps1 = Nd4j.concat((rnnDataFormat == NCW)?1:2, eps, eps);
                    } else {
                        eps1 = eps;
                    }

                    net1.setInput(in);
                    net2.setInput(in);
-                    net3.setInput(TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT));
+                    net3.setInput(TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT, rnnDataFormat));
                    net1.feedForward(true, false);
                    net2.feedForward(true, false);
                    net3.feedForward(true, false);

                    Pair<Gradient, INDArray> p1 = net1.backpropGradient(eps1, LayerWorkspaceMgr.noWorkspaces());
                    Pair<Gradient, INDArray> p2 = net2.backpropGradient(eps, LayerWorkspaceMgr.noWorkspaces());
-                    Pair<Gradient, INDArray> p3 = net3.backpropGradient(TimeSeriesUtils.reverseTimeSeries(eps, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT), LayerWorkspaceMgr.noWorkspaces());
+                    Pair<Gradient, INDArray> p3 = net3.backpropGradient(TimeSeriesUtils.reverseTimeSeries(eps, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT, rnnDataFormat), LayerWorkspaceMgr.noWorkspaces());
                    Gradient g1 = p1.getFirst();
                    Gradient g2 = p2.getFirst();
                    Gradient g3 = p3.getFirst();
@ -520,7 +542,9 @@ public class BidirectionalTest extends BaseDL4JTest {
                    Bidirectional.Mode.AVERAGE, Bidirectional.Mode.MUL};


-            INDArray in = Nd4j.rand(new int[]{3, 10, 6});
+            long[] inshape = rnnDataFormat == NCW ? new long[]{3, 10, 6} : new long[]{3, 6, 10};
+            INDArray in = Nd4j.rand(inshape);
+

            for (Bidirectional.Mode m : modes) {
                ComputationGraphConfiguration conf1 = new NeuralNetConfiguration.Builder()
@ -532,7 +556,7 @@ public class BidirectionalTest extends BaseDL4JTest {
                        .updater(new Adam())
                        .graphBuilder()
                        .addInputs("in")
-                        .layer("0", new Bidirectional(m, new SimpleRnn.Builder().nIn(10).nOut(10).build()), "in")
+                        .layer("0", new Bidirectional(m, new SimpleRnn.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build()), "in")
                        .setOutputs("0")
                        .build();

@ -546,7 +570,7 @@ public class BidirectionalTest extends BaseDL4JTest {
                        .updater(new Adam())
                        .graphBuilder()
                        .addInputs("in")
-                        .layer("0", new SimpleRnn.Builder().nIn(10).nOut(10).build(), "in")
+                        .layer("0", new SimpleRnn.Builder().nIn(10).nOut(10).dataFormat(rnnDataFormat).build(), "in")
                        .setOutputs("0")
                        .build();

@ -566,9 +590,20 @@ public class BidirectionalTest extends BaseDL4JTest {

                INDArray out1 = net1.outputSingle(in);
                INDArray out2 = net2.outputSingle(in);
-                INDArray out3 = TimeSeriesUtils.reverseTimeSeries(net3.outputSingle(
-                        TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT)),
-                        LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT);
+                INDArray out3;
+                INDArray inReverse;
+                if (rnnDataFormat == RNNFormat.NWC){
+                    inReverse = TimeSeriesUtils.reverseTimeSeries(in.permute(0, 2, 1), LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT).permute(0, 2, 1);
+                    out3 = net3.outputSingle(inReverse);
+                    out3 = TimeSeriesUtils.reverseTimeSeries(out3.permute(0, 2, 1), LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT).permute(0, 2, 1);
+
+                }
+                else{
+                    inReverse = TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT);
+                    out3 = net3.outputSingle(inReverse);
+                    out3 = TimeSeriesUtils.reverseTimeSeries(out3, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT);
+
+                }

                INDArray outExp;
                switch (m) {
@ -582,7 +617,9 @@ public class BidirectionalTest extends BaseDL4JTest {
                        outExp = out2.add(out3).muli(0.5);
                        break;
                    case CONCAT:
-                        outExp = Nd4j.concat(1, out2, out3);
+                        System.out.println(out2.shapeInfoToString());
+                        System.out.println(out3.shapeInfoToString());
+                        outExp = Nd4j.concat((rnnDataFormat == NCW)?1:2, out2, out3);
                        break;
                    default:
                        throw new RuntimeException();
@ -594,22 +631,26 @@ public class BidirectionalTest extends BaseDL4JTest {
                //Check gradients:
                if (m == Bidirectional.Mode.ADD || m == Bidirectional.Mode.CONCAT) {

-                    INDArray eps = Nd4j.rand(new int[]{3, 10, 6});
+                    INDArray eps = Nd4j.rand(inshape);

                    INDArray eps1;
                    if (m == Bidirectional.Mode.CONCAT) {
-                        eps1 = Nd4j.concat(1, eps, eps);
+                        eps1 = Nd4j.concat((rnnDataFormat == NCW)?1:2, eps, eps);
                    } else {
                        eps1 = eps;
                    }

+                    INDArray epsReversed = (rnnDataFormat == NCW)?
+                           TimeSeriesUtils.reverseTimeSeries(eps, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT):
+                           TimeSeriesUtils.reverseTimeSeries(eps.permute(0, 2, 1), LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT)
+                           .permute(0, 2, 1);
                    net1.outputSingle(true, false, in);
                    net2.outputSingle(true, false, in);
-                    net3.outputSingle(true, false, TimeSeriesUtils.reverseTimeSeries(in, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT));
+                    net3.outputSingle(true, false, inReverse);

                    Gradient g1 = net1.backpropGradient(eps1);
                    Gradient g2 = net2.backpropGradient(eps);
-                    Gradient g3 = net3.backpropGradient(TimeSeriesUtils.reverseTimeSeries(eps, LayerWorkspaceMgr.noWorkspaces(), ArrayType.INPUT));
+                    Gradient g3 = net3.backpropGradient(epsReversed);

                    for (boolean updates : new boolean[]{false, true}) {
                        if (updates) {
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java
@ -23,6 +23,7 @@ import org.deeplearning4j.nn.api.OptimizationAlgorithm;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.distribution.UniformDistribution;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
@ -31,6 +32,8 @@ import org.deeplearning4j.nn.params.GravesLSTMParamInitializer;
 import org.deeplearning4j.nn.weights.WeightInit;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.activations.impl.ActivationSigmoid;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -42,10 +45,18 @@ import org.nd4j.linalg.primitives.Pair;

 import static org.junit.Assert.*;

-
+@RunWith(Parameterized.class)
 public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
    private double score = 0.0;
+    private RNNFormat rnnDataFormat;

+    public GravesBidirectionalLSTMTest(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+    @Parameterized.Parameters
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void testBidirectionalLSTMGravesForwardBasic() {
        //Very basic test of forward prop. of LSTM layer with a time series.
@ -55,7 +66,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {

        final NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                        .layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder().nIn(nIn)
-                                        .nOut(nHiddenUnits).activation(Activation.TANH).build())
+                                        .nOut(nHiddenUnits).dataFormat(rnnDataFormat).activation(Activation.TANH).build())
                        .build();

        val numParams = conf.getLayer().initializer().numParams(conf);
@ -65,22 +76,41 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {

        //Data: has shape [miniBatchSize,nIn,timeSeriesLength];
        //Output/activations has shape [miniBatchsize,nHiddenUnits,timeSeriesLength];
+        if (rnnDataFormat == RNNFormat.NCW){
+            final INDArray dataSingleExampleTimeLength1 = Nd4j.ones(1, nIn, 1);
+            final INDArray activations1 = layer.activate(dataSingleExampleTimeLength1, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations1.shape(), new long[] {1, nHiddenUnits, 1});

-        final INDArray dataSingleExampleTimeLength1 = Nd4j.ones(1, nIn, 1);
-        final INDArray activations1 = layer.activate(dataSingleExampleTimeLength1, false, LayerWorkspaceMgr.noWorkspaces());
-        assertArrayEquals(activations1.shape(), new long[] {1, nHiddenUnits, 1});
+            final INDArray dataMultiExampleLength1 = Nd4j.ones(10, nIn, 1);
+            final INDArray activations2 = layer.activate(dataMultiExampleLength1, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations2.shape(), new long[] {10, nHiddenUnits, 1});

-        final INDArray dataMultiExampleLength1 = Nd4j.ones(10, nIn, 1);
-        final INDArray activations2 = layer.activate(dataMultiExampleLength1, false, LayerWorkspaceMgr.noWorkspaces());
-        assertArrayEquals(activations2.shape(), new long[] {10, nHiddenUnits, 1});
+            final INDArray dataSingleExampleLength12 = Nd4j.ones(1, nIn, 12);
+            final INDArray activations3 = layer.activate(dataSingleExampleLength12, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations3.shape(), new long[] {1, nHiddenUnits, 12});

-        final INDArray dataSingleExampleLength12 = Nd4j.ones(1, nIn, 12);
-        final INDArray activations3 = layer.activate(dataSingleExampleLength12, false, LayerWorkspaceMgr.noWorkspaces());
-        assertArrayEquals(activations3.shape(), new long[] {1, nHiddenUnits, 12});
+            final INDArray dataMultiExampleLength15 = Nd4j.ones(10, nIn, 15);
+            final INDArray activations4 = layer.activate(dataMultiExampleLength15, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations4.shape(), new long[] {10, nHiddenUnits, 15});
+        }
+        else{
+            final INDArray dataSingleExampleTimeLength1 = Nd4j.ones(1, 1, nIn);
+            final INDArray activations1 = layer.activate(dataSingleExampleTimeLength1, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations1.shape(), new long[] {1, 1, nHiddenUnits});
+
+            final INDArray dataMultiExampleLength1 = Nd4j.ones(10, 1, nIn);
+            final INDArray activations2 = layer.activate(dataMultiExampleLength1, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations2.shape(), new long[] {10, 1, nHiddenUnits});
+
+            final INDArray dataSingleExampleLength12 = Nd4j.ones(1, 12, nIn);
+            final INDArray activations3 = layer.activate(dataSingleExampleLength12, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations3.shape(), new long[] {1, 12, nHiddenUnits});
+
+            final INDArray dataMultiExampleLength15 = Nd4j.ones(10, 15, nIn);
+            final INDArray activations4 = layer.activate(dataMultiExampleLength15, false, LayerWorkspaceMgr.noWorkspaces());
+            assertArrayEquals(activations4.shape(), new long[] {10, 15, nHiddenUnits});
+        }

-        final INDArray dataMultiExampleLength15 = Nd4j.ones(10, nIn, 15);
-        final INDArray activations4 = layer.activate(dataMultiExampleLength15, false, LayerWorkspaceMgr.noWorkspaces());
-        assertArrayEquals(activations4.shape(), new long[] {10, nHiddenUnits, 15});
    }

    @Test
@ -94,14 +124,15 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        testGravesBackwardBasicHelper(13, 3, 17, 1, 1); //Edge case: both miniBatchSize = 1 and timeSeriesLength = 1
    }

-    private static void testGravesBackwardBasicHelper(int nIn, int nOut, int lstmNHiddenUnits, int miniBatchSize,
+    private void testGravesBackwardBasicHelper(int nIn, int nOut, int lstmNHiddenUnits, int miniBatchSize,
                    int timeSeriesLength) {

-        INDArray inputData = Nd4j.ones(miniBatchSize, nIn, timeSeriesLength);
+        INDArray inputData = (rnnDataFormat == RNNFormat.NCW)?Nd4j.ones(miniBatchSize, nIn, timeSeriesLength):
+                Nd4j.ones(miniBatchSize, timeSeriesLength, nIn);

        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                        .layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder().nIn(nIn)
-                                        .nOut(lstmNHiddenUnits)
+                                        .nOut(lstmNHiddenUnits).dataFormat(rnnDataFormat)
                                        .dist(new UniformDistribution(0, 1)).activation(Activation.TANH).build())
                        .build();

@ -114,7 +145,8 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        lstm.activate(inputData, false, LayerWorkspaceMgr.noWorkspaces());
        assertNotNull(lstm.input());

-        INDArray epsilon = Nd4j.ones(miniBatchSize, lstmNHiddenUnits, timeSeriesLength);
+        INDArray epsilon =(rnnDataFormat == RNNFormat.NCW)? Nd4j.ones(miniBatchSize, lstmNHiddenUnits, timeSeriesLength):
+                Nd4j.ones(miniBatchSize, timeSeriesLength, lstmNHiddenUnits);

        Pair<Gradient, INDArray> out = lstm.backpropGradient(epsilon, LayerWorkspaceMgr.noWorkspaces());
        Gradient outGradient = out.getFirst();
@ -147,7 +179,11 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        assertArrayEquals(recurrentWeightGradientB.shape(), new long[] {lstmNHiddenUnits, 4 * lstmNHiddenUnits + 3});

        assertNotNull(nextEpsilon);
-        assertArrayEquals(nextEpsilon.shape(), new long[] {miniBatchSize, nIn, timeSeriesLength});
+        if (rnnDataFormat == RNNFormat.NCW) {
+            assertArrayEquals(nextEpsilon.shape(), new long[]{miniBatchSize, nIn, timeSeriesLength});
+        }else{
+            assertArrayEquals(nextEpsilon.shape(), new long[]{miniBatchSize, timeSeriesLength, nIn });
+        }

        //Check update:
        for (String s : outGradient.gradientForVariable().keySet()) {
@ -226,7 +262,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {

        final NeuralNetConfiguration confBidirectional = new NeuralNetConfiguration.Builder()
                        .layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder().nIn(nIn)
-                                        .nOut(layerSize)
+                                        .nOut(layerSize).dataFormat(rnnDataFormat)
                                        .dist(new UniformDistribution(-0.1, 0.1)).activation(Activation.TANH).build())
                        .build();

@ -237,7 +273,8 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
                        .instantiate(confBidirectional, null, 0, params, true, params.dataType());


-        final INDArray sig = Nd4j.rand(new int[] {miniBatchSize, nIn, timeSeriesLength});
+        final INDArray sig = (rnnDataFormat == RNNFormat.NCW)?Nd4j.rand(new int[] {miniBatchSize, nIn, timeSeriesLength}):
+                Nd4j.rand(new int[] {miniBatchSize, timeSeriesLength, nIn});

        final INDArray act1 = bidirectionalLSTM.activate(sig, false, LayerWorkspaceMgr.noWorkspaces());

@ -265,13 +302,13 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        final NeuralNetConfiguration confBidirectional =
                        new NeuralNetConfiguration.Builder()
                                        .layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder()
-                                                        .nIn(nIn).nOut(layerSize)
+                                                        .nIn(nIn).nOut(layerSize).dataFormat(rnnDataFormat)
                                                        .dist(new UniformDistribution(-0.1, 0.1))
                                                        .activation(Activation.TANH).updater(new NoOp()).build())
                                        .build();

        final NeuralNetConfiguration confForwards = new NeuralNetConfiguration.Builder()
-                        .layer(new org.deeplearning4j.nn.conf.layers.GravesLSTM.Builder().nIn(nIn).nOut(layerSize)
+                        .layer(new org.deeplearning4j.nn.conf.layers.GravesLSTM.Builder().nIn(nIn).nOut(layerSize).dataFormat(rnnDataFormat)
                                        .weightInit(WeightInit.ZERO).activation(Activation.TANH).build())
                        .build();

@ -290,9 +327,16 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
                        Nd4j.create(1, confForwards.getLayer().initializer().numParams(confForwards)));


-        final INDArray sig = Nd4j.rand(new int[] {miniBatchSize, nIn, timeSeriesLength});
+        final INDArray sig = (rnnDataFormat == RNNFormat.NCW)?Nd4j.rand(new int[] {miniBatchSize, nIn, timeSeriesLength}):
+                Nd4j.rand(new int[] {miniBatchSize, timeSeriesLength, nIn});
        final INDArray sigb = sig.dup();
-        reverseColumnsInPlace(sigb.slice(0));
+
+        if (rnnDataFormat == RNNFormat.NCW) {
+            reverseColumnsInPlace(sigb.slice(0));
+        }
+        else{
+            reverseColumnsInPlace(sigb.slice(0).permute(1, 0));
+        }

        final INDArray recurrentWeightsF = bidirectionalLSTM
                        .getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS);
@ -345,10 +389,14 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {

        assertArrayEquals(activation1.data().asFloat(), activation2.data().asFloat(), 1e-5f);

-        final INDArray randSig = Nd4j.rand(new int[] {1, layerSize, timeSeriesLength});
-        final INDArray randSigBackwards = randSig.dup();
-        reverseColumnsInPlace(randSigBackwards.slice(0));
-
+        final INDArray randSig = (rnnDataFormat == RNNFormat.NCW)?Nd4j.rand(new int[] {1, layerSize, timeSeriesLength}):
+                Nd4j.rand(new int[] {1, timeSeriesLength, layerSize});
+        INDArray randSigBackwards = randSig.dup();
+        if (rnnDataFormat == RNNFormat.NCW){
+            reverseColumnsInPlace(randSigBackwards.slice(0));
+        }else{
+            reverseColumnsInPlace(randSigBackwards.slice(0).permute(1, 0));
+        }

        final Pair<Gradient, INDArray> backprop1 = forwardsLSTM.backpropGradient(randSig, LayerWorkspaceMgr.noWorkspaces());
        final Pair<Gradient, INDArray> backprop2 = bidirectionalLSTM.backpropGradient(randSig, LayerWorkspaceMgr.noWorkspaces());
@ -399,10 +447,16 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        final INDArray activation3 = bidirectionalLSTM.activate(sigb, false, LayerWorkspaceMgr.noWorkspaces()).slice(0);

        final INDArray activation3Reverse = activation3.dup();
-        reverseColumnsInPlace(activation3Reverse);
+        if (rnnDataFormat == RNNFormat.NCW){
+            reverseColumnsInPlace(activation3Reverse);
+        }
+        else{
+            reverseColumnsInPlace(activation3Reverse.permute(1, 0));
+        }

-        assertEquals(activation3Reverse, activation1);
        assertArrayEquals(activation3Reverse.shape(), activation1.shape());
+        assertEquals(activation3Reverse, activation1);
+

        //test backprop now
        final INDArray refBackGradientReccurrent =
@ -434,7 +488,12 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
        final INDArray refEpsilon = backprop1.getSecond().dup();
        final INDArray backEpsilon = backprop3.getSecond().dup();

-        reverseColumnsInPlace(refEpsilon.slice(0));
+        if (rnnDataFormat == RNNFormat.NCW) {
+            reverseColumnsInPlace(refEpsilon.slice(0));
+        }
+        else{
+            reverseColumnsInPlace(refEpsilon.slice(0).permute(1, 0));
+        }
        assertArrayEquals(backEpsilon.dup().data().asDouble(), refEpsilon.dup().data().asDouble(), 1e-6);

    }
@ -477,10 +536,10 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
                            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                            .seed(12345).list()
                            .layer(0, new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder()
-                                            .gateActivationFunction(gateAfn).activation(Activation.TANH).nIn(2).nOut(2)
+                                            .gateActivationFunction(gateAfn).activation(Activation.TANH).nIn(2).nOut(2).dataFormat(rnnDataFormat)
                                            .build())
                            .layer(1, new org.deeplearning4j.nn.conf.layers.RnnOutputLayer.Builder()
-                                            .lossFunction(LossFunctions.LossFunction.MSE).nIn(2).nOut(2)
+                                            .lossFunction(LossFunctions.LossFunction.MSE).nIn(2).nOut(2).dataFormat(rnnDataFormat)
                                            .activation(Activation.TANH).build())
                            .build();

@ -492,7 +551,10 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {

            INDArray in = Nd4j.rand(new int[] {3, 2, 5});
            INDArray labels = Nd4j.rand(new int[] {3, 2, 5});
-
+            if (rnnDataFormat == RNNFormat.NWC){
+                in = in.permute(0, 2, 1);
+                labels = labels.permute(0, 2, 1);
+            }
            net.fit(in, labels);
        }
    }
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/MaskZeroLayerTest.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/MaskZeroLayerTest.java
@ -21,11 +21,14 @@ import org.deeplearning4j.TestUtils;
 import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.layers.LSTM;
 import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.deeplearning4j.optimize.api.TrainingListener;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.factory.Nd4j;
@ -36,9 +39,17 @@ import java.util.Collections;

 import static org.junit.Assert.assertEquals;

-
+@RunWith(Parameterized.class)
 public class MaskZeroLayerTest extends BaseDL4JTest {
+    private RNNFormat rnnDataFormat;

+    public MaskZeroLayerTest(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+    @Parameterized.Parameters
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void activate() {

@ -57,7 +68,7 @@ public class MaskZeroLayerTest extends BaseDL4JTest {
                .activation(Activation.IDENTITY)
                .gateActivationFunction(Activation.IDENTITY)
                .nIn(2)
-                .nOut(1)
+                .nOut(1).dataFormat(rnnDataFormat)
                .build();
        NeuralNetConfiguration conf = new NeuralNetConfiguration();
        conf.setLayer(underlying);
@ -72,9 +83,14 @@ public class MaskZeroLayerTest extends BaseDL4JTest {

        MaskZeroLayer l = new MaskZeroLayer(lstm, maskingValue);
        INDArray input = Nd4j.create(Arrays.asList(ex1, ex2), new int[]{2, 2, 3});
+        if (rnnDataFormat == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+        }
        //WHEN
        INDArray out = l.activate(input, true, LayerWorkspaceMgr.noWorkspaces());
-
+        if (rnnDataFormat == RNNFormat.NWC){
+            out = out.permute(0, 2,1);
+        }
        //THEN output should only be incremented for the non-zero timesteps
        INDArray firstExampleOutput = out.get(NDArrayIndex.point(0), NDArrayIndex.all(), NDArrayIndex.all());
        INDArray secondExampleOutput = out.get(NDArrayIndex.point(1), NDArrayIndex.all(), NDArrayIndex.all());
@ -94,7 +110,7 @@ public class MaskZeroLayerTest extends BaseDL4JTest {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .list()
                .layer(new org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer.Builder()
-                        .setMaskValue(0.0).setUnderlying(new LSTM.Builder().nIn(4).nOut(5).build()).build())
+                        .setMaskValue(0.0).setUnderlying(new LSTM.Builder().nIn(4).nOut(5).dataFormat(rnnDataFormat).build()).build())
                .build();
        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/RnnDataFormatTests.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/RnnDataFormatTests.java
@ -0,0 +1,394 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.deeplearning4j.nn.layers.recurrent;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import org.deeplearning4j.BaseDL4JTest;
+import org.deeplearning4j.TestUtils;
+import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
+import org.deeplearning4j.nn.conf.inputs.InputType;
+import org.deeplearning4j.nn.conf.layers.*;
+import org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM;
+import org.deeplearning4j.nn.conf.layers.GravesLSTM;
+import org.deeplearning4j.nn.conf.layers.LSTM;
+import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
+import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
+import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
+import org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer;
+import org.deeplearning4j.nn.gradient.Gradient;
+import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.nd4j.linalg.activations.Activation;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.factory.Nd4j;
+import org.nd4j.linalg.primitives.Pair;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(Parameterized.class)
+@AllArgsConstructor
+public class RnnDataFormatTests extends BaseDL4JTest {
+
+    private boolean helpers;
+    private boolean lastTimeStep;
+    private boolean maskZeros;
+
+    @Parameterized.Parameters(name = "helpers={0},lastTimeStep={1},maskZero={2}")
+    public static List params(){
+        List<Object[]> ret = new ArrayList<>();
+        for (boolean helpers: new boolean[]{true, false})
+            for (boolean lastTimeStep: new boolean[]{true, false})
+                for (boolean maskZero: new boolean[]{true, false})
+                    ret.add(new Object[]{helpers, lastTimeStep, maskZero});
+        return ret;
+    }
+
+
+    @Test
+    public void testSimpleRnn() {
+        try {
+
+                    Nd4j.getRandom().setSeed(12345);
+                    Nd4j.getEnvironment().allowHelpers(helpers);
+                    String msg = "Helpers: " + helpers + ", lastTimeStep: " + lastTimeStep + ", maskZeros: " + maskZeros;
+                    System.out.println(" --- " + msg + " ---");
+
+                    INDArray inNCW = Nd4j.rand(DataType.FLOAT, 2, 3, 12);
+
+                    INDArray labelsNWC = (lastTimeStep) ?TestUtils.randomOneHot(2, 10): TestUtils.randomOneHot(2 * 12, 10).reshape(2, 12, 10);
+
+                    TestCase tc = TestCase.builder()
+                            .msg(msg)
+                            .net1(getSimpleRnnNet(RNNFormat.NCW, true, lastTimeStep, maskZeros))
+                            .net2(getSimpleRnnNet(RNNFormat.NCW, false, lastTimeStep, maskZeros))
+                            .net3(getSimpleRnnNet(RNNFormat.NWC, true, lastTimeStep, maskZeros))
+                            .net4(getSimpleRnnNet(RNNFormat.NWC, false, lastTimeStep, maskZeros))
+                            .inNCW(inNCW)
+                            .labelsNCW((lastTimeStep)? labelsNWC: labelsNWC.permute(0, 2, 1))
+                            .labelsNWC(labelsNWC)
+                            .testLayerIdx(1)
+                            .build();
+
+                    TestCase.testHelper(tc);
+
+
+        } finally {
+            Nd4j.getEnvironment().allowHelpers(true);
+        }
+    }
+
+    @Test
+    public void testLSTM() {
+        try {
+
+            Nd4j.getRandom().setSeed(12345);
+            Nd4j.getEnvironment().allowHelpers(helpers);
+            String msg = "Helpers: " + helpers + ", lastTimeStep: " + lastTimeStep + ", maskZeros: " + maskZeros;
+            System.out.println(" --- " + msg + " ---");
+
+            INDArray inNCW = Nd4j.rand(DataType.FLOAT, 2, 3, 12);
+
+            INDArray labelsNWC = (lastTimeStep) ?TestUtils.randomOneHot(2, 10): TestUtils.randomOneHot(2 * 12, 10).reshape(2, 12, 10);
+
+            TestCase tc = TestCase.builder()
+                    .msg(msg)
+                    .net1(getLstmNet(RNNFormat.NCW, true, lastTimeStep, maskZeros))
+                    .net2(getLstmNet(RNNFormat.NCW, false, lastTimeStep, maskZeros))
+                    .net3(getLstmNet(RNNFormat.NWC, true, lastTimeStep, maskZeros))
+                    .net4(getLstmNet(RNNFormat.NWC, false, lastTimeStep, maskZeros))
+                    .inNCW(inNCW)
+                    .labelsNCW((lastTimeStep)? labelsNWC: labelsNWC.permute(0, 2, 1))
+                    .labelsNWC(labelsNWC)
+                    .testLayerIdx(1)
+                    .build();
+
+            TestCase.testHelper(tc);
+
+
+        } finally {
+            Nd4j.getEnvironment().allowHelpers(true);
+        }
+    }
+
+
+    @Test
+    public void testGraveLSTM() {
+        try {
+
+            Nd4j.getRandom().setSeed(12345);
+            Nd4j.getEnvironment().allowHelpers(helpers);
+            String msg = "Helpers: " + helpers + ", lastTimeStep: " + lastTimeStep + ", maskZeros: " + maskZeros;
+            System.out.println(" --- " + msg + " ---");
+
+            INDArray inNCW = Nd4j.rand(DataType.FLOAT, 2, 3, 12);
+
+            INDArray labelsNWC = (lastTimeStep) ?TestUtils.randomOneHot(2, 10): TestUtils.randomOneHot(2 * 12, 10).reshape(2, 12, 10);
+
+            TestCase tc = TestCase.builder()
+                    .msg(msg)
+                    .net1(getGravesLstmNet(RNNFormat.NCW, true, lastTimeStep, maskZeros))
+                    .net2(getGravesLstmNet(RNNFormat.NCW, false, lastTimeStep, maskZeros))
+                    .net3(getGravesLstmNet(RNNFormat.NWC, true, lastTimeStep, maskZeros))
+                    .net4(getGravesLstmNet(RNNFormat.NWC, false, lastTimeStep, maskZeros))
+                    .inNCW(inNCW)
+                    .labelsNCW((lastTimeStep)? labelsNWC: labelsNWC.permute(0, 2, 1))
+                    .labelsNWC(labelsNWC)
+                    .testLayerIdx(1)
+                    .build();
+
+            TestCase.testHelper(tc);
+
+
+        } finally {
+            Nd4j.getEnvironment().allowHelpers(true);
+        }
+    }
+
+
+    @Test
+    public void testGraveBiLSTM() {
+        try {
+
+            Nd4j.getRandom().setSeed(12345);
+            Nd4j.getEnvironment().allowHelpers(helpers);
+            String msg = "Helpers: " + helpers + ", lastTimeStep: " + lastTimeStep + ", maskZeros: " + maskZeros;
+            System.out.println(" --- " + msg + " ---");
+
+            INDArray inNCW = Nd4j.rand(DataType.FLOAT, 2, 3, 12);
+
+            INDArray labelsNWC = (lastTimeStep) ?TestUtils.randomOneHot(2, 10): TestUtils.randomOneHot(2 * 12, 10).reshape(2, 12, 10);
+
+            TestCase tc = TestCase.builder()
+                    .msg(msg)
+                    .net1(getGravesBidirectionalLstmNet(RNNFormat.NCW, true, lastTimeStep, maskZeros))
+                    .net2(getGravesBidirectionalLstmNet(RNNFormat.NCW, false, lastTimeStep, maskZeros))
+                    .net3(getGravesBidirectionalLstmNet(RNNFormat.NWC, true, lastTimeStep, maskZeros))
+                    .net4(getGravesBidirectionalLstmNet(RNNFormat.NWC, false, lastTimeStep, maskZeros))
+                    .inNCW(inNCW)
+                    .labelsNCW((lastTimeStep)? labelsNWC: labelsNWC.permute(0, 2, 1))
+                    .labelsNWC(labelsNWC)
+                    .testLayerIdx(1)
+                    .build();
+
+            TestCase.testHelper(tc);
+
+
+        } finally {
+            Nd4j.getEnvironment().allowHelpers(true);
+        }
+    }
+
+
+    private MultiLayerNetwork getGravesBidirectionalLstmNet(RNNFormat format, boolean setOnLayerAlso, boolean lastTimeStep, boolean maskZeros) {
+        if (setOnLayerAlso) {
+            return getNetWithLayer(new GravesBidirectionalLSTM.Builder().nOut(3)
+                    .dataFormat(format).build(), format, lastTimeStep, maskZeros);
+        } else {
+            return getNetWithLayer(new  GravesBidirectionalLSTM.Builder().nOut(3).build(), format, lastTimeStep, maskZeros);
+        }
+    }
+    private MultiLayerNetwork getGravesLstmNet(RNNFormat format, boolean setOnLayerAlso, boolean lastTimeStep, boolean maskZeros) {
+        if (setOnLayerAlso) {
+            return getNetWithLayer(new GravesLSTM.Builder().nOut(3)
+                    .dataFormat(format).build(), format, lastTimeStep, maskZeros);
+        } else {
+            return getNetWithLayer(new GravesLSTM.Builder().nOut(3).build(), format, lastTimeStep, maskZeros);
+        }
+    }
+
+    private MultiLayerNetwork getLstmNet(RNNFormat format, boolean setOnLayerAlso, boolean lastTimeStep, boolean maskZeros) {
+        if (setOnLayerAlso) {
+            return getNetWithLayer(new LSTM.Builder().nOut(3)
+                    .dataFormat(format).build(), format, lastTimeStep, maskZeros);
+        } else {
+            return getNetWithLayer(new LSTM.Builder().nOut(3).build(), format, lastTimeStep, maskZeros);
+        }
+    }
+
+    private MultiLayerNetwork getSimpleRnnNet(RNNFormat format, boolean setOnLayerAlso, boolean lastTimeStep, boolean maskZeros) {
+        if (setOnLayerAlso) {
+            return getNetWithLayer(new SimpleRnn.Builder().nOut(3)
+                    .dataFormat(format).build(), format, lastTimeStep, maskZeros);
+        } else {
+            return getNetWithLayer(new SimpleRnn.Builder().nOut(3).build(), format, lastTimeStep, maskZeros);
+        }
+    }
+    private MultiLayerNetwork getNetWithLayer(Layer layer, RNNFormat format, boolean lastTimeStep, boolean maskZeros) {
+        if (maskZeros){
+            layer = new MaskZeroLayer.Builder().setMaskValue(0.).setUnderlying(layer).build();
+        }
+        if(lastTimeStep){
+            layer = new LastTimeStep(layer);
+        }
+        NeuralNetConfiguration.ListBuilder builder = new NeuralNetConfiguration.Builder()
+                .seed(12345)
+                .list()
+                .layer(new LSTM.Builder()
+                        .nIn(3)
+                        .activation(Activation.TANH)
+                        .dataFormat(format)
+                        .nOut(3)
+                        .helperAllowFallback(false)
+                        .build())
+                .layer(layer)
+                .layer(
+                        (lastTimeStep)?new OutputLayer.Builder().activation(Activation.SOFTMAX).nOut(10).build():
+        new RnnOutputLayer.Builder().activation(Activation.SOFTMAX).nOut(10).dataFormat(format).build()
+                )
+                .setInputType(InputType.recurrent(3, 12, format));
+
+        MultiLayerNetwork net = new MultiLayerNetwork(builder.build());
+        net.init();
+        return net;
+    }
+
+    @AllArgsConstructor
+    @Data
+    @NoArgsConstructor
+    @Builder
+    private static class TestCase {
+        private String msg;
+        private MultiLayerNetwork net1;
+        private MultiLayerNetwork net2;
+        private MultiLayerNetwork net3;
+        private MultiLayerNetwork net4;
+        private INDArray inNCW;
+        private INDArray labelsNCW;
+        private INDArray labelsNWC;
+        private int testLayerIdx;
+        private boolean nwcOutput;
+
+        public static void testHelper(TestCase tc) {
+
+            tc.net2.params().assign(tc.net1.params());
+            tc.net3.params().assign(tc.net1.params());
+            tc.net4.params().assign(tc.net1.params());
+
+            INDArray inNCW = tc.inNCW;
+            INDArray inNWC = tc.inNCW.permute(0, 2, 1).dup();
+
+            INDArray l0_1 = tc.net1.feedForward(inNCW).get(tc.testLayerIdx + 1);
+            INDArray l0_2 = tc.net2.feedForward(inNCW).get(tc.testLayerIdx + 1);
+            INDArray l0_3 = tc.net3.feedForward(inNWC).get(tc.testLayerIdx + 1);
+            INDArray l0_4 = tc.net4.feedForward(inNWC).get(tc.testLayerIdx + 1);
+
+            boolean rank3Out = tc.labelsNCW.rank() == 3;
+            assertEquals(tc.msg, l0_1, l0_2);
+            if (rank3Out){
+                assertEquals(tc.msg, l0_1, l0_3.permute(0, 2, 1));
+                assertEquals(tc.msg, l0_1, l0_4.permute(0, 2, 1));
+            }
+            else{
+                assertEquals(tc.msg, l0_1, l0_3);
+                assertEquals(tc.msg, l0_1, l0_4);
+            }
+            INDArray out1 = tc.net1.output(inNCW);
+            INDArray out2 = tc.net2.output(inNCW);
+            INDArray out3 = tc.net3.output(inNWC);
+            INDArray out4 = tc.net4.output(inNWC);
+
+            assertEquals(tc.msg, out1, out2);
+            if (rank3Out){
+                assertEquals(tc.msg, out1, out3.permute(0, 2, 1));      //NWC to NCW
+                assertEquals(tc.msg, out1, out4.permute(0, 2, 1));
+            }
+            else{
+                assertEquals(tc.msg, out1, out3);      //Labels are not rank 3 here: outputs compared directly, no permute
+                assertEquals(tc.msg, out1, out4);
+            }
+
+
+            //Test backprop
+            Pair<Gradient, INDArray> p1 = tc.net1.calculateGradients(inNCW, tc.labelsNCW, null, null);
+            Pair<Gradient, INDArray> p2 = tc.net2.calculateGradients(inNCW, tc.labelsNCW, null, null);
+            Pair<Gradient, INDArray> p3 = tc.net3.calculateGradients(inNWC, tc.labelsNWC, null, null);
+            Pair<Gradient, INDArray> p4 = tc.net4.calculateGradients(inNWC, tc.labelsNWC, null, null);
+
+            //Input gradients
+            assertEquals(tc.msg, p1.getSecond(), p2.getSecond());
+
+            assertEquals(tc.msg, p1.getSecond(), p3.getSecond().permute(0, 2, 1));  //Input gradients for NWC input are also in NWC format
+            assertEquals(tc.msg, p1.getSecond(), p4.getSecond().permute(0, 2, 1));
+
+
+            List<String> diff12 = differentGrads(p1.getFirst(), p2.getFirst());
+            List<String> diff13 = differentGrads(p1.getFirst(), p3.getFirst());
+            List<String> diff14 = differentGrads(p1.getFirst(), p4.getFirst());
+            assertEquals(tc.msg + " " + diff12, 0, diff12.size());
+            assertEquals(tc.msg + " " + diff13, 0, diff13.size());
+            assertEquals(tc.msg + " " + diff14, 0, diff14.size());
+
+            assertEquals(tc.msg, p1.getFirst().gradientForVariable(), p2.getFirst().gradientForVariable());
+            assertEquals(tc.msg, p1.getFirst().gradientForVariable(), p3.getFirst().gradientForVariable());
+            assertEquals(tc.msg, p1.getFirst().gradientForVariable(), p4.getFirst().gradientForVariable());
+
+            tc.net1.fit(inNCW, tc.labelsNCW);
+            tc.net2.fit(inNCW, tc.labelsNCW);
+            tc.net3.fit(inNWC, tc.labelsNWC);
+            tc.net4.fit(inNWC, tc.labelsNWC);
+
+            assertEquals(tc.msg, tc.net1.params(), tc.net2.params());
+            assertEquals(tc.msg, tc.net1.params(), tc.net3.params());
+            assertEquals(tc.msg, tc.net1.params(), tc.net4.params());
+
+            //Test serialization
+            MultiLayerNetwork net1a = TestUtils.testModelSerialization(tc.net1);
+            MultiLayerNetwork net2a = TestUtils.testModelSerialization(tc.net2);
+            MultiLayerNetwork net3a = TestUtils.testModelSerialization(tc.net3);
+            MultiLayerNetwork net4a = TestUtils.testModelSerialization(tc.net4);
+
+            out1 = tc.net1.output(inNCW);
+            assertEquals(tc.msg, out1, net1a.output(inNCW));
+            assertEquals(tc.msg, out1, net2a.output(inNCW));
+
+            if (rank3Out) {
+                assertEquals(tc.msg, out1, net3a.output(inNWC).permute(0, 2, 1));   //NWC to NCW
+                assertEquals(tc.msg, out1, net4a.output(inNWC).permute(0, 2, 1));
+            }
+            else{
+                assertEquals(tc.msg, out1, net3a.output(inNWC));   //Labels are not rank 3 here: outputs compared directly, no permute
+                assertEquals(tc.msg, out1, net4a.output(inNWC));
+            }
+        }
+
+    }
+    private static List<String> differentGrads(Gradient g1, Gradient g2){
+        List<String> differs = new ArrayList<>();
+        Map<String,INDArray> m1 = g1.gradientForVariable();
+        Map<String,INDArray> m2 = g2.gradientForVariable();
+        for(String s : m1.keySet()){
+            INDArray a1 = m1.get(s);
+            INDArray a2 = m2.get(s);
+            if(!a1.equals(a2)){
+                differs.add(s);
+            }
+        }
+        return differs;
+    }
+}
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestLastTimeStepLayer.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestLastTimeStepLayer.java
@ -21,6 +21,7 @@ import org.deeplearning4j.TestUtils;
 import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.DenseLayer;
 import org.deeplearning4j.nn.conf.layers.LSTM;
@ -29,6 +30,8 @@ import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
 import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
 import org.deeplearning4j.nn.graph.ComputationGraph;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.indexing.NDArrayIndex;
@ -42,14 +45,25 @@ import static org.nd4j.linalg.activations.Activation.IDENTITY;
 import static org.nd4j.linalg.activations.Activation.TANH;
 import static org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction.MSE;

+
+@RunWith(Parameterized.class)
 public class TestLastTimeStepLayer extends BaseDL4JTest {
+    private RNNFormat rnnDataFormat;
+
+    public TestLastTimeStepLayer(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+    @Parameterized.Parameters(name="{0}")
+    public static Object[] params(){
+        return RNNFormat.values();
+    }

    @Test
    public void testLastTimeStepVertex() {

        ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().graphBuilder().addInputs("in")
                .addLayer("lastTS", new LastTimeStep(new SimpleRnn.Builder()
-                        .nIn(5).nOut(6).build()), "in")
+                        .nIn(5).nOut(6).dataFormat(rnnDataFormat).build()), "in")
                .setOutputs("lastTS")
                .build();

@ -59,9 +73,22 @@ public class TestLastTimeStepLayer extends BaseDL4JTest {
        //First: test without input mask array
        Nd4j.getRandom().setSeed(12345);
        Layer l = graph.getLayer("lastTS");
-        INDArray in = Nd4j.rand(new int[]{3, 5, 6});
+        INDArray in;
+        if (rnnDataFormat == RNNFormat.NCW){
+            in = Nd4j.rand(3, 5, 6);
+        }
+        else{
+            in = Nd4j.rand(3, 6, 5);
+        }
        INDArray outUnderlying = ((LastTimeStepLayer)l).getUnderlying().activate(in, false, LayerWorkspaceMgr.noWorkspaces());
-        INDArray expOut = outUnderlying.get(NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.point(5));
+        INDArray expOut;
+        if (rnnDataFormat == RNNFormat.NCW){
+            expOut = outUnderlying.get(NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.point(5));
+        }
+        else{
+            expOut = outUnderlying.get(NDArrayIndex.all(), NDArrayIndex.point(5), NDArrayIndex.all());
+        }
+


        //Forward pass:
@ -76,9 +103,17 @@ public class TestLastTimeStepLayer extends BaseDL4JTest {
        graph.setLayerMaskArrays(new INDArray[]{inMask}, null);

        expOut = Nd4j.zeros(3, 6);
-        expOut.putRow(0, outUnderlying.get(NDArrayIndex.point(0), NDArrayIndex.all(), NDArrayIndex.point(2)));
-        expOut.putRow(1, outUnderlying.get(NDArrayIndex.point(1), NDArrayIndex.all(), NDArrayIndex.point(3)));
-        expOut.putRow(2, outUnderlying.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(4)));
+        if (rnnDataFormat == RNNFormat.NCW){
+            expOut.putRow(0, outUnderlying.get(NDArrayIndex.point(0), NDArrayIndex.all(), NDArrayIndex.point(2)));
+            expOut.putRow(1, outUnderlying.get(NDArrayIndex.point(1), NDArrayIndex.all(), NDArrayIndex.point(3)));
+            expOut.putRow(2, outUnderlying.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(4)));
+        }
+        else{
+            expOut.putRow(0, outUnderlying.get(NDArrayIndex.point(0), NDArrayIndex.point(2), NDArrayIndex.all()));
+            expOut.putRow(1, outUnderlying.get(NDArrayIndex.point(1), NDArrayIndex.point(3), NDArrayIndex.all()));
+            expOut.putRow(2, outUnderlying.get(NDArrayIndex.point(2), NDArrayIndex.point(4), NDArrayIndex.all()));
+        }
+

        outFwd = l.activate(in, false, LayerWorkspaceMgr.noWorkspaces());
        assertEquals(expOut, outFwd);
@ -97,9 +132,9 @@ public class TestLastTimeStepLayer extends BaseDL4JTest {
                .seed(1234)
                .graphBuilder()
                .addInputs("in")
-                .setInputTypes(InputType.recurrent(1))
+                .setInputTypes(InputType.recurrent(1, rnnDataFormat))
                .addLayer("RNN", new LastTimeStep(new LSTM.Builder()
-                        .nOut(10)
+                        .nOut(10).dataFormat(rnnDataFormat)
                        .build()), "in")
                .addLayer("dense", new DenseLayer.Builder()
                        .nOut(10)
@ -120,7 +155,9 @@ public class TestLastTimeStepLayer extends BaseDL4JTest {
        INDArray fm2 = Nd4j.zeros(1,24);
        INDArray fm3 = Nd4j.zeros(1,24);
        fm3.get(NDArrayIndex.point(0), NDArrayIndex.interval(0,5)).assign(1);
-
+        if (rnnDataFormat == RNNFormat.NWC){
+            f = f.permute(0, 2, 1);
+        }
        INDArray[] out1 = cg.output(false, new INDArray[]{f}, new INDArray[]{fm1});
        try {
            cg.output(false, new INDArray[]{f}, new INDArray[]{fm2});
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestRnnLayers.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestRnnLayers.java
@ -20,6 +20,7 @@ import org.deeplearning4j.BaseDL4JTest;
 import org.deeplearning4j.TestUtils;
 import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.dropout.TestDropout;
 import org.deeplearning4j.nn.conf.layers.GravesLSTM;
 import org.deeplearning4j.nn.conf.layers.LSTM;
@ -31,6 +32,8 @@ import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
 import org.deeplearning4j.nn.weights.WeightInit;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -46,8 +49,18 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;

+@RunWith(Parameterized.class)
 public class TestRnnLayers extends BaseDL4JTest {

+    /** RNN data layout that the layers and arrays in this parameterized run are built with. */
+    private final RNNFormat rnnDataFormat;
+
+    /**
+     * Called by the Parameterized runner once per value returned from {@link #params()}.
+     *
+     * @param rnnDataFormat data layout to use for this run
+     */
+    public TestRnnLayers(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+
+    /**
+     * Supplies one run per {@link RNNFormat} constant. The name pattern puts the format into the
+     * reported test name instead of a bare run index.
+     */
+    @Parameterized.Parameters(name = "{0}")
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void testTimeStepIs3Dimensional() {

@ -58,8 +71,8 @@ public class TestRnnLayers extends BaseDL4JTest {
                .updater(new NoOp())
                .weightInit(WeightInit.XAVIER)
                .list()
-                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(3).build())
-                .layer(new LSTM.Builder().nIn(3).nOut(5).build())
+                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(3).dataFormat(rnnDataFormat).build())
+                .layer(new LSTM.Builder().nIn(3).nOut(5).dataFormat(rnnDataFormat).build())
                .layer(new RnnOutputLayer.Builder().nOut(nOut).activation(Activation.SOFTMAX).build())
                .build();

@ -70,9 +83,9 @@ public class TestRnnLayers extends BaseDL4JTest {
        org.deeplearning4j.nn.layers.recurrent.SimpleRnn simpleRnn =
                (org.deeplearning4j.nn.layers.recurrent.SimpleRnn) net.getLayer(0);

-        INDArray rnnInput3d = Nd4j.create(10, 12, 1);
+        INDArray rnnInput3d = (rnnDataFormat==RNNFormat.NCW)?Nd4j.create(10,12, 1):Nd4j.create(10, 1, 12);
        INDArray simpleOut = simpleRnn.rnnTimeStep(rnnInput3d, LayerWorkspaceMgr.noWorkspaces());
-        assertTrue(Arrays.equals(simpleOut.shape(), new long[] {10, 3, 1}));
+        assertTrue(Arrays.equals(simpleOut.shape(), (rnnDataFormat==RNNFormat.NCW)?new long[] {10, 3, 1}:new long[]{10, 1, 3}));

        INDArray rnnInput2d = Nd4j.create(10, 12);
        try {
@ -84,9 +97,9 @@ public class TestRnnLayers extends BaseDL4JTest {
        org.deeplearning4j.nn.layers.recurrent.LSTM lstm =
                (org.deeplearning4j.nn.layers.recurrent.LSTM) net.getLayer(1);

-        INDArray lstmInput3d = Nd4j.create(10, 3, 1);
+        INDArray lstmInput3d = (rnnDataFormat==RNNFormat.NCW)?Nd4j.create(10, 3, 1):Nd4j.create(10, 1, 3);
        INDArray lstmOut = lstm.rnnTimeStep(lstmInput3d, LayerWorkspaceMgr.noWorkspaces());
-        assertTrue(Arrays.equals(lstmOut.shape(), new long[] {10, 5, 1}));
+        assertTrue(Arrays.equals(lstmOut.shape(), (rnnDataFormat==RNNFormat.NCW)?new long[] {10, 5, 1}:new long[]{10, 1, 5}));

        INDArray lstmInput2d = Nd4j.create(10, 3);
        try {
@ -112,19 +125,19 @@ public class TestRnnLayers extends BaseDL4JTest {
            TestDropout.CustomDropout cd = new TestDropout.CustomDropout();
            switch (s){
                case "graves":
-                    layer = new GravesLSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD = new GravesLSTM.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD2 = new GravesLSTM.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).build();
+                    layer = new GravesLSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD = new GravesLSTM.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD2 = new GravesLSTM.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
                    break;
                case "lstm":
-                    layer = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD2 = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).build();
+                    layer = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD2 = new org.deeplearning4j.nn.conf.layers.LSTM.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
                    break;
                case "simple":
-                    layer = new SimpleRnn.Builder().activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD = new SimpleRnn.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).build();
-                    layerD2 = new SimpleRnn.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).build();
+                    layer = new SimpleRnn.Builder().activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD = new SimpleRnn.Builder().dropOut(0.5).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
+                    layerD2 = new SimpleRnn.Builder().dropOut(cd).activation(Activation.TANH).nIn(10).nOut(10).dataFormat(rnnDataFormat).build();
                    break;
                default:
                    throw new RuntimeException(s);
@ -134,21 +147,21 @@ public class TestRnnLayers extends BaseDL4JTest {
                    .seed(12345)
                    .list()
                    .layer(layer)
-                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).build())
+                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
                    .build();

            MultiLayerConfiguration confD = new NeuralNetConfiguration.Builder()
                    .seed(12345)
                    .list()
                    .layer(layerD)
-                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).build())
+                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
                    .build();

            MultiLayerConfiguration confD2 = new NeuralNetConfiguration.Builder()
                    .seed(12345)
                    .list()
                    .layer(layerD2)
-                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).build())
+                    .layer(new RnnOutputLayer.Builder().activation(Activation.TANH).lossFunction(LossFunctions.LossFunction.MSE).nIn(10).nOut(10).dataFormat(rnnDataFormat).build())
                    .build();

            MultiLayerNetwork net = new MultiLayerNetwork(conf);
@ -178,7 +191,6 @@ public class TestRnnLayers extends BaseDL4JTest {
            assertNotEquals(s, out2, out2D);

            INDArray l = TestUtils.randomOneHotTimeSeries(3, 10, 10, 12345);
-
            net.fit(f.dup(), l);
            netD.fit(f.dup(), l);
            assertNotEquals(s, net.params(), netD.params());
@ -209,10 +221,10 @@ public class TestRnnLayers extends BaseDL4JTest {

            switch (i){
                case 0:
-                    lb.layer(new RnnOutputLayer.Builder().activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).nIn(5).nOut(5).build());
+                    lb.layer(new RnnOutputLayer.Builder().activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).nIn(5).nOut(5).dataFormat(rnnDataFormat).build());
                    break;
                case 1:
-                    lb.layer(new RnnLossLayer.Builder().activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build());
+                    lb.layer(new RnnLossLayer.Builder().activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).dataFormat(rnnDataFormat).build());
                    break;
                default:
                    throw new RuntimeException();
@ -224,13 +236,16 @@ public class TestRnnLayers extends BaseDL4JTest {

            INDArray in = Nd4j.rand(DataType.FLOAT, 3, 5, 5);
            INDArray l = TestUtils.randomOneHotTimeSeries(3, 5, 10);
-
+            if (rnnDataFormat == RNNFormat.NWC){
+                l = l.permute(0, 2, 1);
+            }
            try{
                net.fit(in,l);
            } catch (Throwable t){
                String msg = t.getMessage();
                if(msg == null)
                    t.printStackTrace();
+                System.out.println(i);
                assertTrue(msg, msg != null && msg.contains("sequence length") && msg.contains("input") && msg.contains("label"));
            }

--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestSimpleRnn.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestSimpleRnn.java
@ -20,10 +20,13 @@ import org.deeplearning4j.BaseDL4JTest;
 import org.deeplearning4j.TestUtils;
 import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn;
 import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
 import org.deeplearning4j.nn.weights.WeightInit;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -36,8 +39,18 @@ import static org.nd4j.linalg.indexing.NDArrayIndex.all;
 import static org.nd4j.linalg.indexing.NDArrayIndex.interval;
 import static org.nd4j.linalg.indexing.NDArrayIndex.point;

+@RunWith(Parameterized.class)
 public class TestSimpleRnn extends BaseDL4JTest {

+    /** RNN data layout that the layers and arrays in this parameterized run are built with. */
+    private final RNNFormat rnnDataFormat;
+
+    /**
+     * Called by the Parameterized runner once per value returned from {@link #params()}.
+     *
+     * @param rnnDataFormat data layout to use for this run
+     */
+    public TestSimpleRnn(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+
+    /**
+     * Supplies one run per {@link RNNFormat} constant. The name pattern puts the format into the
+     * reported test name instead of a bare run index.
+     */
+    @Parameterized.Parameters(name = "{0}")
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void testSimpleRnn(){
        Nd4j.getRandom().setSeed(12345);
@ -46,7 +59,15 @@ public class TestSimpleRnn extends BaseDL4JTest {
        int nIn = 5;
        int layerSize = 6;
        int tsLength = 7;
-        INDArray in = Nd4j.rand(DataType.FLOAT, new int[]{m, nIn, tsLength});
+        INDArray in;
+        if (rnnDataFormat == RNNFormat.NCW){
+            in = Nd4j.rand(DataType.FLOAT, new int[]{m, nIn, tsLength});
+        }
+        else{
+            in = Nd4j.rand(DataType.FLOAT, new int[]{m, tsLength, nIn});
+        }
+
+
 //        in.get(all(), all(), interval(1,tsLength)).assign(0);

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
@ -54,7 +75,7 @@ public class TestSimpleRnn extends BaseDL4JTest {
                .weightInit(WeightInit.XAVIER)
                .activation(Activation.TANH)
                .list()
-                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(layerSize).build())
+                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(layerSize).dataFormat(rnnDataFormat).build())
                .build();

        MultiLayerNetwork net = new MultiLayerNetwork(conf);
@ -68,7 +89,13 @@ public class TestSimpleRnn extends BaseDL4JTest {

        INDArray outLast = null;
        for( int i=0; i<tsLength; i++ ){
-            INDArray inCurrent = in.get(all(), all(), point(i));
+            INDArray inCurrent;
+            if (rnnDataFormat == RNNFormat.NCW){
+                inCurrent = in.get(all(), all(), point(i));
+            }
+            else{
+                inCurrent = in.get(all(), point(i), all());
+            }

            INDArray outExpCurrent = inCurrent.mmul(w);
            if(outLast != null){
@ -79,7 +106,13 @@ public class TestSimpleRnn extends BaseDL4JTest {

            Transforms.tanh(outExpCurrent, false);

-            INDArray outActCurrent = out.get(all(), all(), point(i));
+            INDArray outActCurrent;
+            if (rnnDataFormat == RNNFormat.NCW){
+                outActCurrent = out.get(all(), all(), point(i));
+            }
+            else{
+                outActCurrent = out.get(all(), point(i), all());
+            }
            assertEquals(String.valueOf(i), outExpCurrent, outActCurrent);

            outLast = outExpCurrent;
@ -100,7 +133,7 @@ public class TestSimpleRnn extends BaseDL4JTest {
                .weightInit(WeightInit.XAVIER)
                .activation(Activation.TANH)
                .list()
-                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(layerSize)
+                .layer(new SimpleRnn.Builder().nIn(nIn).nOut(layerSize).dataFormat(rnnDataFormat)
                        .biasInit(100)
                        .build())
                .build();
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestTimeDistributed.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/TestTimeDistributed.java
@ -4,6 +4,7 @@ import org.deeplearning4j.BaseDL4JTest;
 import org.deeplearning4j.TestUtils;
 import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.WorkspaceMode;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.DenseLayer;
@ -12,6 +13,8 @@ import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
 import org.deeplearning4j.nn.conf.layers.recurrent.TimeDistributed;
 import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -22,8 +25,18 @@ import org.nd4j.linalg.lossfunctions.LossFunctions;

 import static org.junit.Assert.assertEquals;

+@RunWith(Parameterized.class)
 public class TestTimeDistributed extends BaseDL4JTest {

+    /** RNN data layout that the layers and arrays in this parameterized run are built with. */
+    private final RNNFormat rnnDataFormat;
+
+    /**
+     * Called by the Parameterized runner once per value returned from {@link #params()}.
+     *
+     * @param rnnDataFormat data layout to use for this run
+     */
+    public TestTimeDistributed(RNNFormat rnnDataFormat){
+        this.rnnDataFormat = rnnDataFormat;
+    }
+
+    /**
+     * Supplies one run per {@link RNNFormat} constant. The name pattern puts the format into the
+     * reported test name instead of a bare run index.
+     */
+    @Parameterized.Parameters(name = "{0}")
+    public static Object[] params(){
+        return RNNFormat.values();
+    }
    @Test
    public void testTimeDistributed(){
        for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) {
@ -34,11 +47,11 @@ public class TestTimeDistributed extends BaseDL4JTest {
                    .seed(12345)
                    .updater(new Adam(0.1))
                    .list()
-                    .layer(new LSTM.Builder().nIn(3).nOut(3).build())
+                    .layer(new LSTM.Builder().nIn(3).nOut(3).dataFormat(rnnDataFormat).build())
                    .layer(new DenseLayer.Builder().nIn(3).nOut(3).activation(Activation.TANH).build())
-                    .layer(new RnnOutputLayer.Builder().nIn(3).nOut(3).activation(Activation.SOFTMAX)
+                    .layer(new RnnOutputLayer.Builder().nIn(3).nOut(3).activation(Activation.SOFTMAX).dataFormat(rnnDataFormat)
                            .lossFunction(LossFunctions.LossFunction.MCXENT).build())
-                    .setInputType(InputType.recurrent(3))
+                    .setInputType(InputType.recurrent(3, rnnDataFormat))
                    .build();

            MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder()
@ -47,11 +60,11 @@ public class TestTimeDistributed extends BaseDL4JTest {
                    .seed(12345)
                    .updater(new Adam(0.1))
                    .list()
-                    .layer(new LSTM.Builder().nIn(3).nOut(3).build())
-                    .layer(new TimeDistributed(new DenseLayer.Builder().nIn(3).nOut(3).activation(Activation.TANH).build(), 2))
-                    .layer(new RnnOutputLayer.Builder().nIn(3).nOut(3).activation(Activation.SOFTMAX)
+                    .layer(new LSTM.Builder().nIn(3).nOut(3).dataFormat(rnnDataFormat).build())
+                    .layer(new TimeDistributed(new DenseLayer.Builder().nIn(3).nOut(3).activation(Activation.TANH).build(), rnnDataFormat))
+                    .layer(new RnnOutputLayer.Builder().nIn(3).nOut(3).activation(Activation.SOFTMAX).dataFormat(rnnDataFormat)
                            .lossFunction(LossFunctions.LossFunction.MCXENT).build())
-                    .setInputType(InputType.recurrent(3))
+                    .setInputType(InputType.recurrent(3, rnnDataFormat))
                    .build();

            MultiLayerNetwork net1 = new MultiLayerNetwork(conf1);
@ -62,13 +75,21 @@ public class TestTimeDistributed extends BaseDL4JTest {
            for( int mb : new int[]{1, 5}) {
                for(char inLabelOrder : new char[]{'c', 'f'}) {
                    INDArray in = Nd4j.rand(DataType.FLOAT, mb, 3, 5).dup(inLabelOrder);
-
+                    if (rnnDataFormat == RNNFormat.NWC){
+                        in = in.permute(0, 2, 1);
+                    }
                    INDArray out1 = net1.output(in);
                    INDArray out2 = net2.output(in);
-
                    assertEquals(out1, out2);

-                    INDArray labels = TestUtils.randomOneHotTimeSeries(mb, 3, 5).dup(inLabelOrder);
+                    INDArray labels ;
+                    if (rnnDataFormat == RNNFormat.NCW) {
+                        labels = TestUtils.randomOneHotTimeSeries(mb, 3, 5).dup(inLabelOrder);
+                    }else{
+                        labels = TestUtils.randomOneHotTimeSeries(mb, 5, 3).dup(inLabelOrder);
+                    }
+
+

                    DataSet ds = new DataSet(in, labels);
                    net1.fit(ds);
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java
@ -22,6 +22,7 @@ import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.ArrayUtils;
 import org.deeplearning4j.nn.api.layers.LayerConstraint;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.Convolution1DLayer;
 import org.deeplearning4j.nn.conf.layers.InputTypeUtil;
@ -160,8 +161,8 @@ public class KerasConvolution1D extends KerasConvolution {
    public InputPreProcessor getInputPreprocessor(InputType... inputType) throws InvalidKerasConfigurationException {
+        // Rejects more than one input type, then asks InputTypeUtil for the preprocessor that
+        // matches the first input type.
        if (inputType.length > 1)
            throw new InvalidKerasConfigurationException(
-                    "Keras LSTM layer accepts only one input (received " + inputType.length + ")");
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], layerName);
+                    "Keras Conv1D layer accepts only one input (received " + inputType.length + ")");
+        // The data format is fixed to NCW here rather than being read from the layer configuration.
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], RNNFormat.NCW,layerName);
    }


--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java
@ -22,11 +22,9 @@ import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.deeplearning4j.nn.api.layers.LayerConstraint;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
-import org.deeplearning4j.nn.conf.layers.FeedForwardLayer;
-import org.deeplearning4j.nn.conf.layers.InputTypeUtil;
-import org.deeplearning4j.nn.conf.layers.LSTM;
-import org.deeplearning4j.nn.conf.layers.Layer;
+import org.deeplearning4j.nn.conf.layers.*;
 import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
 import org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer;
 import org.deeplearning4j.nn.conf.layers.wrapper.BaseWrapperLayer;
@ -37,6 +35,7 @@ import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils;
 import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils;
 import org.deeplearning4j.nn.params.LSTMParamInitializer;
 import org.deeplearning4j.nn.weights.IWeightInit;
+import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.linalg.activations.IActivation;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.factory.Nd4j;
@ -266,7 +265,8 @@ public class KerasLSTM extends KerasLayer {
            throw new InvalidKerasConfigurationException("Keras LSTM layer accepts only one single input" +
                    "or three (input to LSTM and two states tensors, but " +
                    "received " + inputType.length + ".");
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], layerName);
+        RNNFormat f = TimeSeriesUtils.getFormatFromRnnLayer(layer);
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], f,layerName);
    }

    /**
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java
@ -21,7 +21,9 @@ import lombok.EqualsAndHashCode;
 import lombok.extern.slf4j.Slf4j;
 import org.deeplearning4j.nn.api.layers.LayerConstraint;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
+import org.deeplearning4j.nn.conf.layers.BaseRecurrentLayer;
 import org.deeplearning4j.nn.conf.layers.FeedForwardLayer;
 import org.deeplearning4j.nn.conf.layers.InputTypeUtil;
 import org.deeplearning4j.nn.conf.layers.Layer;
@ -36,6 +38,7 @@ import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils;
 import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils;
 import org.deeplearning4j.nn.params.SimpleRnnParamInitializer;
 import org.deeplearning4j.nn.weights.IWeightInit;
+import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.primitives.Pair;

@ -227,7 +230,8 @@ public class KerasSimpleRnn extends KerasLayer {
            throw new InvalidKerasConfigurationException(
                    "Keras SimpleRnn layer accepts only one input (received " + inputType.length + ")");

-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], layerName);
+        RNNFormat f = TimeSeriesUtils.getFormatFromRnnLayer(layer);
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], f, layerName);
    }

    /**
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/wrappers/KerasBidirectional.java
@ -218,7 +218,7 @@ public class KerasBidirectional extends KerasLayer {
        if (inputType.length > 1)
            throw new InvalidKerasConfigurationException(
                    "Keras Bidirectional layer accepts only one input (received " + inputType.length + ")");
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], layerName);
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType[0], ((Bidirectional)layer).getRNNDataFormat(), layerName);
    }

    /**
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/api/layers/RecurrentLayer.java
@ -17,6 +17,7 @@
 package org.deeplearning4j.nn.api.layers;

 import org.deeplearning4j.nn.api.Layer;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.primitives.Pair;
@ -98,4 +99,5 @@ public interface RecurrentLayer extends Layer {
     */
    Pair<Gradient, INDArray> tbpttBackpropGradient(INDArray epsilon, int tbpttBackLength, LayerWorkspaceMgr workspaceMgr);

+
 }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/RNNFormat.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/RNNFormat.java
@ -0,0 +1,29 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+
+package org.deeplearning4j.nn.conf;
+
+/**
+ * Data layout of 3d recurrent (time series) arrays.<br>
+ * NCW = "channels first" - arrays of shape [minibatch, channels, width]<br>
+ * NWC = "channels last" - arrays of shape [minibatch, width, channels]<br>
+ * "width" corresponds to sequence length and "channels" corresponds to sequence item size.
+ */
+
+public enum RNNFormat {
+    /** Channels first: arrays of shape [minibatch, channels, width]. */
+    NCW,
+    /** Channels last: arrays of shape [minibatch, width, channels]. */
+    NWC
+}
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/inputs/InputType.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/inputs/InputType.java
@ -20,6 +20,7 @@ import lombok.Data;
 import lombok.EqualsAndHashCode;
 import lombok.Getter;
 import lombok.NoArgsConstructor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.CNN2DFormat;
 import org.deeplearning4j.nn.conf.layers.Convolution3D;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -111,9 +112,16 @@ public abstract class InputType implements Serializable {
     * @return InputTypeRecurrent
     */
    public static InputType recurrent(long size, long timeSeriesLength) {
-        return new InputTypeRecurrent(size, timeSeriesLength);
+        return new InputTypeRecurrent(size, timeSeriesLength, RNNFormat.NCW);
    }

+    /**
+     * Input type for recurrent data with an explicit data format and no time series length given.
+     *
+     * @param size   size passed to {@link InputTypeRecurrent}
+     * @param format data format (NCW or NWC) of the input
+     * @return InputTypeRecurrent
+     */
+    public static InputType recurrent(long size, RNNFormat format){
+        return new InputTypeRecurrent(size, format);
+    }
+
+    /**
+     * Input type for recurrent data with an explicit time series length and data format.
+     *
+     * @param size             size passed to {@link InputTypeRecurrent}
+     * @param timeSeriesLength time series length passed to {@link InputTypeRecurrent}
+     * @param format           data format (NCW or NWC) of the input
+     * @return InputTypeRecurrent
+     */
+    public static InputType recurrent(long size, long timeSeriesLength, RNNFormat format){
+        return new InputTypeRecurrent(size, timeSeriesLength, format);
+    }
    /**
     * Input type for convolutional (CNN) data, that is 4d with shape [miniBatchSize, channels, height, width].
     * For CNN data that has been flattened, use {@link #convolutionalFlat(long, long, long)}
@ -216,14 +224,23 @@ public abstract class InputType implements Serializable {
    public static class InputTypeRecurrent extends InputType {
        private long size;
        private long timeSeriesLength;
-
+        private RNNFormat format = RNNFormat.NCW;
        public InputTypeRecurrent(long size) {
+            // No sequence length given: -1 is passed on as the time series length.
            this(size, -1);
        }
+        /** Uses the NCW format when the caller does not specify one. */
+        public InputTypeRecurrent(long size, long timeSeriesLength){
+            this(size, timeSeriesLength, RNNFormat.NCW);
+        }

-        public InputTypeRecurrent(@JsonProperty("size") long size, @JsonProperty("timeSeriesLength") long timeSeriesLength) {
+        /** Format-only variant: passes -1 as the time series length. */
+        public  InputTypeRecurrent(long size, RNNFormat format){
+            this(size, -1, format);
+        }
+        /**
+         * Full constructor that the shorter constructors delegate to; it also carries the JSON
+         * property names.
+         *
+         * @param size             size of each sequence item
+         * @param timeSeriesLength sequence length; the shorter constructors pass -1 when none is given
+         * @param format           data format; {@code null} falls back to {@link RNNFormat#NCW}
+         */
+        public InputTypeRecurrent(@JsonProperty("size") long size,
+                                  @JsonProperty("timeSeriesLength") long timeSeriesLength,
+                                  @JsonProperty("format") RNNFormat format) {
            this.size = size;
            this.timeSeriesLength = timeSeriesLength;
+            // A null format (for example, JSON that has no "format" property may arrive that way)
+            // must not overwrite the NCW field default, so it is mapped back to NCW here.
+            this.format = format == null ? RNNFormat.NCW : format;
        }

        @Override
@ -234,9 +251,9 @@ public abstract class InputType implements Serializable {
        @Override
        public String toString() {
+            // The time series length is printed only when it is positive; the format is always printed.
            if (timeSeriesLength > 0) {
-                return "InputTypeRecurrent(" + size + ",timeSeriesLength=" + timeSeriesLength + ")";
+                return "InputTypeRecurrent(" + size + ",timeSeriesLength=" + timeSeriesLength + ",format=" + format + ")";
            } else {
-                return "InputTypeRecurrent(" + size + ")";
+                return "InputTypeRecurrent(" + size + ",format=" + format + ")";
            }
        }

@ -251,8 +268,23 @@ public abstract class InputType implements Serializable {

        @Override
        public long[] getShape(boolean includeBatchDim) {
-            if(includeBatchDim) return new long[]{-1, size, timeSeriesLength};
-            else return new long[]{size, timeSeriesLength};
+            // NCW places the feature size ahead of the time axis; any other format value places time first.
+            final boolean sizeFirst = (format == RNNFormat.NCW);
+            if (includeBatchDim) {
+                // -1 stands in for the minibatch dimension
+                return sizeFirst
+                        ? new long[]{-1, size, timeSeriesLength}
+                        : new long[]{-1, timeSeriesLength, size};
+            }
+            return sizeFirst
+                    ? new long[]{size, timeSeriesLength}
+                    : new long[]{timeSeriesLength, size};
        }
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseRecurrentLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BaseRecurrentLayer.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.conf.layers;
 import lombok.*;
 import org.deeplearning4j.nn.api.layers.LayerConstraint;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.distribution.Distribution;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.weights.IWeightInit;
@ -35,10 +36,12 @@ import java.util.List;
 public abstract class BaseRecurrentLayer extends FeedForwardLayer {

    protected IWeightInit weightInitFnRecurrent;
+    protected RNNFormat rnnDataFormat = RNNFormat.NCW;

    protected BaseRecurrentLayer(Builder builder) {
        super(builder);
        this.weightInitFnRecurrent = builder.weightInitFnRecurrent;
+        this.rnnDataFormat = builder.rnnDataFormat;
    }

    @Override
@ -51,7 +54,7 @@ public abstract class BaseRecurrentLayer extends FeedForwardLayer {

        InputType.InputTypeRecurrent itr = (InputType.InputTypeRecurrent) inputType;

-        return InputType.recurrent(nOut, itr.getTimeSeriesLength());
+        return InputType.recurrent(nOut, itr.getTimeSeriesLength(), itr.getFormat());
    }

    @Override
@ -64,12 +67,13 @@ public abstract class BaseRecurrentLayer extends FeedForwardLayer {
        if (nIn <= 0 || override) {
            InputType.InputTypeRecurrent r = (InputType.InputTypeRecurrent) inputType;
            this.nIn = r.getSize();
+            this.rnnDataFormat = r.getFormat();
        }
    }

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, rnnDataFormat,getLayerName());
    }

    @NoArgsConstructor
@ -77,6 +81,12 @@ public abstract class BaseRecurrentLayer extends FeedForwardLayer {
    @Setter
    public static abstract class Builder<T extends Builder<T>> extends FeedForwardLayer.Builder<T> {

+        /**
+         * Set the format of data expected by the RNN. NCW = [miniBatchSize, size, timeSeriesLength],
+         * NWC = [miniBatchSize, timeSeriesLength, size]. Defaults to NCW.
+         */
+        protected RNNFormat rnnDataFormat = RNNFormat.NCW;
+
        /**
         * Set constraints to be applied to the RNN recurrent weight parameters of this layer. Default: no
         * constraints.<br> Constraints can be used to enforce certain conditions (non-negativity of parameters,
@ -163,5 +173,10 @@ public abstract class BaseRecurrentLayer extends FeedForwardLayer {
            this.setWeightInitFnRecurrent(new WeightInitDistribution(dist));
            return (T) this;
        }
+
+        public T dataFormat(RNNFormat rnnDataFormat){
+            this.rnnDataFormat = rnnDataFormat;
+            return (T)this;
+        }
    }
 }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Convolution1DLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Convolution1DLayer.java
@ -22,6 +22,7 @@ import lombok.NoArgsConstructor;
 import lombok.ToString;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.optimize.api.TrainingListener;
 import org.deeplearning4j.util.Convolution1DUtils;
@ -114,7 +115,7 @@ public class Convolution1DLayer extends ConvolutionLayer {
                            + "\"): input is null");
        }

-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW,getLayerName());
    }

    public static class Builder extends ConvolutionLayer.BaseConvBuilder<Builder> {
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/FeedForwardLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/FeedForwardLayer.java
@ -87,7 +87,7 @@ public abstract class FeedForwardLayer extends BaseLayer {
                return null;
            case RNN:
                //RNN -> FF
-                return new RnnToFeedForwardPreProcessor();
+                return new RnnToFeedForwardPreProcessor(((InputType.InputTypeRecurrent)inputType).getFormat());
            case CNN:
                //CNN -> FF
                InputType.InputTypeConvolutional c = (InputType.InputTypeConvolutional) inputType;
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/InputTypeUtil.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/InputTypeUtil.java
@ -22,6 +22,7 @@ import org.deeplearning4j.exception.DL4JInvalidConfigException;
 import org.deeplearning4j.nn.conf.CNN2DFormat;
 import org.deeplearning4j.nn.conf.ConvolutionMode;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.preprocessor.CnnToRnnPreProcessor;
 import org.deeplearning4j.nn.conf.preprocessor.FeedForwardToCnnPreProcessor;
@ -528,7 +529,7 @@ public class InputTypeUtil {
        }
    }

-    public static InputPreProcessor getPreprocessorForInputTypeRnnLayers(InputType inputType, String layerName) {
+    public static InputPreProcessor getPreprocessorForInputTypeRnnLayers(InputType inputType, RNNFormat rnnDataFormat, String layerName) {
        if (inputType == null) {
            throw new IllegalStateException(
                            "Invalid input for RNN layer (layer name = \"" + layerName + "\"): input type is null");
@ -539,14 +540,14 @@ public class InputTypeUtil {
            case CNNFlat:
                //FF -> RNN or CNNFlat -> RNN
                //In either case, input data format is a row vector per example
-                return new FeedForwardToRnnPreProcessor();
+                return new FeedForwardToRnnPreProcessor(rnnDataFormat);
            case RNN:
                //RNN -> RNN: No preprocessor necessary
                return null;
            case CNN:
                //CNN -> RNN
                InputType.InputTypeConvolutional c = (InputType.InputTypeConvolutional) inputType;
-                return new CnnToRnnPreProcessor(c.getHeight(), c.getWidth(), c.getChannels());
+                return new CnnToRnnPreProcessor(c.getHeight(), c.getWidth(), c.getChannels(), rnnDataFormat);
            default:
                throw new RuntimeException("Unknown input type: " + inputType);
        }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LearnedSelfAttentionLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LearnedSelfAttentionLayer.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.conf.layers;
 import lombok.*;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.samediff.SDLayerParams;
 import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer;
@ -86,7 +87,7 @@ public class LearnedSelfAttentionLayer extends SameDiffLayer {

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW,getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocallyConnected1D.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocallyConnected1D.java
@ -20,6 +20,7 @@ import lombok.*;
 import org.deeplearning4j.nn.conf.ConvolutionMode;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.samediff.SDLayerParams;
 import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer;
@ -136,7 +137,7 @@ public class LocallyConnected1D extends SameDiffLayer {

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW, getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RecurrentAttentionLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RecurrentAttentionLayer.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.conf.layers;
 import lombok.*;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.samediff.SDLayerParams;
 import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer;
@ -92,7 +93,7 @@ public class RecurrentAttentionLayer extends SameDiffLayer {

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW, getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RnnLossLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RnnLossLayer.java
@ -24,6 +24,7 @@ import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.ParamInitializer;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.memory.LayerMemoryReport;
 import org.deeplearning4j.nn.conf.memory.MemoryReport;
@ -53,12 +54,13 @@ import java.util.Map;
@ToString(callSuper = true)
@EqualsAndHashCode(callSuper = true)
 public class RnnLossLayer extends FeedForwardLayer {
-
+    private RNNFormat rnnDataFormat = RNNFormat.NCW;
    protected ILossFunction lossFn;

    private RnnLossLayer(Builder builder) {
        super(builder);
        this.setLossFn(builder.lossFn);
+        this.rnnDataFormat = builder.rnnDataFormat;
    }

    @Override
@ -91,7 +93,7 @@ public class RnnLossLayer extends FeedForwardLayer {

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        // Pass the format configured on this layer (see Builder#dataFormat) rather than a fixed NCW, so that a
+        // preprocessor inserted in front of an NWC loss layer produces activations in the layout the layer expects.
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, rnnDataFormat, getLayerName());
    }

    @Override
@ -111,8 +113,9 @@ public class RnnLossLayer extends FeedForwardLayer {

    public static class Builder extends BaseOutputLayer.Builder<Builder> {

-        public Builder() {
+        private RNNFormat rnnDataFormat = RNNFormat.NCW;

+        public Builder() {
        }

        /**
@ -153,6 +156,14 @@ public class RnnLossLayer extends FeedForwardLayer {
                    "This layer has no parameters, thus nIn will always equal nOut.");
        }

+        /**
+         * @param rnnDataFormat Data format expected by the layer. NCW = [miniBatchSize, size, timeSeriesLength],
+         * NWC = [miniBatchSize, timeSeriesLength, size]. Defaults to NCW.
+         */
+        public Builder dataFormat(RNNFormat rnnDataFormat){
+            this.rnnDataFormat = rnnDataFormat;
+            return this;
+        }
        @Override
        @SuppressWarnings("unchecked")
        public RnnLossLayer build() {
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RnnOutputLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/RnnOutputLayer.java
@ -24,6 +24,7 @@ import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.ParamInitializer;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.params.DefaultParamInitializer;
 import org.deeplearning4j.optimize.api.TrainingListener;
@ -51,9 +52,11 @@ import java.util.Map;
@EqualsAndHashCode(callSuper = true)
 public class RnnOutputLayer extends BaseOutputLayer {

+    private RNNFormat rnnDataFormat = RNNFormat.NCW;
    private RnnOutputLayer(Builder builder) {
        super(builder);
        initializeConstraints(builder);
+        this.rnnDataFormat = builder.rnnDataFormat;
    }

    @Override
@ -85,7 +88,7 @@ public class RnnOutputLayer extends BaseOutputLayer {
        }
        InputType.InputTypeRecurrent itr = (InputType.InputTypeRecurrent) inputType;

-        return InputType.recurrent(nOut, itr.getTimeSeriesLength());
+        return InputType.recurrent(nOut, itr.getTimeSeriesLength(), itr.getFormat());
    }

    @Override
@ -97,18 +100,20 @@ public class RnnOutputLayer extends BaseOutputLayer {

        if (nIn <= 0 || override) {
            InputType.InputTypeRecurrent r = (InputType.InputTypeRecurrent) inputType;
+            this.rnnDataFormat = r.getFormat();
            this.nIn = r.getSize();
        }
    }

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, rnnDataFormat, getLayerName());
    }


    public static class Builder extends BaseOutputLayer.Builder<Builder> {

+        private RNNFormat rnnDataFormat = RNNFormat.NCW;
        public Builder() {
            //Set default activation function to softmax (to match default loss function MCXENT)
            this.setActivationFn(new ActivationSoftmax());
@ -137,5 +142,14 @@ public class RnnOutputLayer extends BaseOutputLayer {
        public RnnOutputLayer build() {
            return new RnnOutputLayer(this);
        }
+
+        /**
+         * @param rnnDataFormat Data format expected by the layer. NCW = [miniBatchSize, size, timeSeriesLength],
+         * NWC = [miniBatchSize, timeSeriesLength, size]. Defaults to NCW.
+         */
+        public Builder dataFormat(RNNFormat rnnDataFormat){
+            this.rnnDataFormat = rnnDataFormat;
+            return this;
+        }
    }
 }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SelfAttentionLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SelfAttentionLayer.java
@ -18,6 +18,7 @@ package org.deeplearning4j.nn.conf.layers;

 import lombok.*;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.samediff.SDLayerParams;
 import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer;
@ -75,7 +76,7 @@ public class SelfAttentionLayer extends SameDiffLayer {

    @Override
    public InputPreProcessor getPreProcessorForInputType(InputType inputType) {
-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW,getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling1DLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling1DLayer.java
@ -22,6 +22,7 @@ import lombok.NoArgsConstructor;
 import lombok.ToString;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.optimize.api.TrainingListener;
 import org.deeplearning4j.util.Convolution1DUtils;
@ -105,7 +106,7 @@ public class Subsampling1DLayer extends SubsamplingLayer {
                            + "\"): input is null");
        }

-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW, getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ZeroPadding1DLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ZeroPadding1DLayer.java
@ -20,6 +20,7 @@ import lombok.*;
 import org.deeplearning4j.nn.api.ParamInitializer;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.memory.LayerMemoryReport;
 import org.deeplearning4j.nn.conf.memory.MemoryReport;
@ -104,7 +105,7 @@ public class ZeroPadding1DLayer extends NoParamLayer {
                            + "\"): input is null");
        }

-        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, getLayerName());
+        return InputTypeUtil.getPreprocessorForInputTypeRnnLayers(inputType, RNNFormat.NCW, getLayerName());
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/Bidirectional.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/Bidirectional.java
@ -21,6 +21,7 @@ import org.deeplearning4j.nn.api.ParamInitializer;
 import org.deeplearning4j.nn.conf.GradientNormalization;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.BaseRecurrentLayer;
 import org.deeplearning4j.nn.conf.layers.FeedForwardLayer;
@ -30,6 +31,7 @@ import org.deeplearning4j.nn.conf.memory.LayerMemoryReport;
 import org.deeplearning4j.nn.layers.recurrent.BidirectionalLayer;
 import org.deeplearning4j.nn.params.BidirectionalParamInitializer;
 import org.deeplearning4j.optimize.api.TrainingListener;
+import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.learning.config.IUpdater;
@ -124,6 +126,10 @@ public class Bidirectional extends Layer {
        }
    }

+    public RNNFormat getRNNDataFormat(){
+        return TimeSeriesUtils.getFormatFromRnnLayer(fwd);
+    }
+
    @Override
    public org.deeplearning4j.nn.api.Layer instantiate(NeuralNetConfiguration conf,
                                                       Collection<TrainingListener> trainingListeners, int layerIndex, INDArray layerParamsView,
@ -170,7 +176,7 @@ public class Bidirectional extends Layer {
        } else {
            InputType.InputTypeRecurrent r = (InputType.InputTypeRecurrent) outOrig;
            if (mode == Mode.CONCAT) {
-                return InputType.recurrent(2 * r.getSize());
+                return InputType.recurrent(2 * r.getSize(), getRNNDataFormat());
            } else {
                return r;
            }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/recurrent/TimeDistributed.java
@ -5,6 +5,7 @@ import lombok.EqualsAndHashCode;
 import lombok.NonNull;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.conf.layers.Layer;
 import org.deeplearning4j.nn.conf.layers.wrapper.BaseWrapperLayer;
@ -29,17 +30,19 @@ import java.util.Collection;
@EqualsAndHashCode(callSuper = true)
 public class TimeDistributed extends BaseWrapperLayer {

-    private final int timeAxis;
+    private RNNFormat rnnDataFormat = RNNFormat.NCW;

    /**
     * @param underlying Underlying (internal) layer - should be a feed forward type such as DenseLayer
-     * @param timeAxis   Time axis, should be 2 for DL4J RNN activations (shape [minibatch, size, sequenceLength])
+     * @param rnnDataFormat Data format of the RNN activations. NCW = [miniBatchSize, size, timeSeriesLength],
+     *                      NWC = [miniBatchSize, timeSeriesLength, size]. A null value is treated as NCW.
     */
-    public TimeDistributed(@JsonProperty("underlying") @NonNull Layer underlying, @JsonProperty("timeAxis") int timeAxis) {
+    public TimeDistributed(@JsonProperty("underlying") @NonNull Layer underlying, @JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat) {
        super(underlying);
-        this.timeAxis = timeAxis;
+        // JSON without an "rnnDataFormat" property (for instance a document that only carries the former
+        // "timeAxis" property) hands null to this constructor; keep the NCW default in that case.
+        this.rnnDataFormat = (rnnDataFormat == null) ? RNNFormat.NCW : rnnDataFormat;
    }

+    /**
+     * Wraps the given layer, leaving the data format at its NCW default.
+     *
+     * @param underlying Underlying (internal) layer - should be a feed forward type such as DenseLayer
+     */
+    public TimeDistributed(@NonNull Layer underlying){
+        super(underlying);
+    }

    @Override
    public org.deeplearning4j.nn.api.Layer instantiate(NeuralNetConfiguration conf, Collection<TrainingListener> trainingListeners,
@ -47,7 +50,7 @@ public class TimeDistributed extends BaseWrapperLayer {
        NeuralNetConfiguration conf2 = conf.clone();
        conf2.setLayer(((TimeDistributed) conf2.getLayer()).getUnderlying());
        return new TimeDistributedLayer(underlying.instantiate(conf2, trainingListeners, layerIndex, layerParamsView,
-                initializeParams, networkDataType), timeAxis);
+                initializeParams, networkDataType), rnnDataFormat);
    }

    @Override
@ -59,7 +62,7 @@ public class TimeDistributed extends BaseWrapperLayer {
        InputType.InputTypeRecurrent rnn = (InputType.InputTypeRecurrent) inputType;
        InputType ff = InputType.feedForward(rnn.getSize());
        InputType ffOut = underlying.getOutputType(layerIndex, ff);
-        return InputType.recurrent(ffOut.arrayElementsPerExample(), rnn.getTimeSeriesLength());
+        return InputType.recurrent(ffOut.arrayElementsPerExample(), rnn.getTimeSeriesLength(), rnnDataFormat);
    }

    @Override
@ -70,6 +73,7 @@ public class TimeDistributed extends BaseWrapperLayer {

        InputType.InputTypeRecurrent rnn = (InputType.InputTypeRecurrent) inputType;
        InputType ff = InputType.feedForward(rnn.getSize());
+        this.rnnDataFormat = rnn.getFormat();
        underlying.setNIn(ff, override);
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/CnnToRnnPreProcessor.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/CnnToRnnPreProcessor.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.conf.preprocessor;
 import lombok.*;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.base.Preconditions;
@ -38,7 +39,7 @@ import java.util.Arrays;
 * Functionally equivalent to combining CnnToFeedForwardPreProcessor + FeedForwardToRnnPreProcessor<br>
 * Specifically, this does two things:<br>
 * (a) Reshape 4d activations out of CNN layer, with shape [timeSeriesLength*miniBatchSize, numChannels, inputHeight, inputWidth])
- * into 3d (time series) activations (with shape [numExamples, inputHeight*inputWidth*numChannels, timeSeriesLength])
+ * into 3d (time series) activations (with shape [miniBatchSize, inputHeight*inputWidth*numChannels, timeSeriesLength])
 * for use in RNN layers<br>
 * (b) Reshapes 3d epsilons (weights.*deltas) out of RNN layer (with shape
 * [miniBatchSize,inputHeight*inputWidth*numChannels,timeSeriesLength]) into 4d epsilons with shape
@ -52,6 +53,7 @@ public class CnnToRnnPreProcessor implements InputPreProcessor {
    private long inputHeight;
    private long inputWidth;
    private long numChannels;
+    private RNNFormat rnnDataFormat = RNNFormat.NCW;

    @Getter(AccessLevel.NONE)
    @Setter(AccessLevel.NONE)
@ -59,11 +61,20 @@ public class CnnToRnnPreProcessor implements InputPreProcessor {

    @JsonCreator
    public CnnToRnnPreProcessor(@JsonProperty("inputHeight") long inputHeight,
-                    @JsonProperty("inputWidth") long inputWidth, @JsonProperty("numChannels") long numChannels) {
+                                @JsonProperty("inputWidth") long inputWidth,
+                                @JsonProperty("numChannels") long numChannels,
+                                @JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat) {
        this.inputHeight = inputHeight;
        this.inputWidth = inputWidth;
        this.numChannels = numChannels;
        this.product = inputHeight * inputWidth * numChannels;
+        // JSON without an "rnnDataFormat" property hands null to this creator. Default to NCW so that
+        // preProcess() still applies its NCW permute instead of silently returning the unpermuted array.
+        this.rnnDataFormat = (rnnDataFormat == null) ? RNNFormat.NCW : rnnDataFormat;
+    }
+
+    public CnnToRnnPreProcessor(long inputHeight,
+                                long inputWidth,
+                                long numChannels){
+        this(inputHeight, inputWidth, numChannels, RNNFormat.NCW);
    }

    @Override
@ -90,14 +101,19 @@ public class CnnToRnnPreProcessor implements InputPreProcessor {
        //Second: reshape 2d to 3d, as per FeedForwardToRnnPreProcessor
        INDArray reshaped = workspaceMgr.dup(ArrayType.ACTIVATIONS, twod, 'f');
        reshaped = reshaped.reshape('f', miniBatchSize, shape[0] / miniBatchSize, product);
-        return reshaped.permute(0, 2, 1);
+        if (rnnDataFormat == RNNFormat.NCW) {
+            return reshaped.permute(0, 2, 1);
+        }
+        return reshaped;
    }

    @Override
    public INDArray backprop(INDArray output, int miniBatchSize, LayerWorkspaceMgr workspaceMgr) {
        if (output.ordering() == 'c' || !Shape.hasDefaultStridesForShape(output))
            output = output.dup('f');
-
+        if (rnnDataFormat == RNNFormat.NWC){
+            output = output.permute(0, 2, 1);
+        }
        val shape = output.shape();
        INDArray output2d;
        if (shape[0] == 1) {
@ -122,7 +138,7 @@ public class CnnToRnnPreProcessor implements InputPreProcessor {

    @Override
    public CnnToRnnPreProcessor clone() {
-        return new CnnToRnnPreProcessor(inputHeight, inputWidth, numChannels);
+        return new CnnToRnnPreProcessor(inputHeight, inputWidth, numChannels, rnnDataFormat);
    }

    @Override
@ -133,7 +149,7 @@ public class CnnToRnnPreProcessor implements InputPreProcessor {

        InputType.InputTypeConvolutional c = (InputType.InputTypeConvolutional) inputType;
        val outSize = c.getChannels() * c.getHeight() * c.getWidth();
-        return InputType.recurrent(outSize);
+        return InputType.recurrent(outSize, rnnDataFormat);
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/FeedForwardToRnnPreProcessor.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/FeedForwardToRnnPreProcessor.java
@ -21,6 +21,7 @@ import lombok.NoArgsConstructor;
 import lombok.val;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -28,7 +29,7 @@ import org.nd4j.linalg.api.shape.Shape;
 import org.nd4j.linalg.primitives.Pair;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.deeplearning4j.nn.workspace.ArrayType;
-
+import org.nd4j.shade.jackson.annotation.JsonProperty;
 import java.util.Arrays;

 /**
@ -48,7 +49,11 @@ import java.util.Arrays;
@Data
@NoArgsConstructor
 public class FeedForwardToRnnPreProcessor implements InputPreProcessor {
+    private RNNFormat rnnDataFormat = RNNFormat.NCW;

+    /**
+     * @param rnnDataFormat Data format of the 3d activations produced by this preprocessor; a null value is
+     *                      treated as NCW
+     */
+    public FeedForwardToRnnPreProcessor(@JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat){
+        // Never store null: preProcess() and backprop() compare this field against the enum constants, and a
+        // null would make preProcess() skip the NCW permute.
+        this.rnnDataFormat = (rnnDataFormat == null) ? RNNFormat.NCW : rnnDataFormat;
+    }
    @Override
    public INDArray preProcess(INDArray input, int miniBatchSize, LayerWorkspaceMgr workspaceMgr) {
        //Need to reshape FF activations (2d) activations to 3d (for input into RNN layer)
@ -60,7 +65,10 @@ public class FeedForwardToRnnPreProcessor implements InputPreProcessor {

        val shape = input.shape();
        INDArray reshaped = input.reshape('f', miniBatchSize, shape[0] / miniBatchSize, shape[1]);
-        return workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, reshaped.permute(0, 2, 1));
+        if (rnnDataFormat == RNNFormat.NCW){
+            reshaped = reshaped.permute(0, 2, 1);
+        }
+        return workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, reshaped);
    }

    @Override
@ -71,6 +79,9 @@ public class FeedForwardToRnnPreProcessor implements InputPreProcessor {
                            "Invalid input: expect NDArray with rank 3 (i.e., epsilons from RNN layer)");
        if (output.ordering() != 'f' || !Shape.hasDefaultStridesForShape(output))
            output = workspaceMgr.dup(ArrayType.ACTIVATION_GRAD, output, 'f');
+        if (rnnDataFormat == RNNFormat.NWC){
+            output = output.permute(0, 2, 1);
+        }
        val shape = output.shape();

        INDArray ret;
@ -87,12 +98,7 @@ public class FeedForwardToRnnPreProcessor implements InputPreProcessor {

    @Override
    public FeedForwardToRnnPreProcessor clone() {
-        try {
-            FeedForwardToRnnPreProcessor clone = (FeedForwardToRnnPreProcessor) super.clone();
-            return clone;
-        } catch (CloneNotSupportedException e) {
-            throw new RuntimeException(e);
-        }
+        return new FeedForwardToRnnPreProcessor(rnnDataFormat);
    }

    @Override
@ -104,10 +110,10 @@ public class FeedForwardToRnnPreProcessor implements InputPreProcessor {

        if (inputType.getType() == InputType.Type.FF) {
            InputType.InputTypeFeedForward ff = (InputType.InputTypeFeedForward) inputType;
-            return InputType.recurrent(ff.getSize());
+            return InputType.recurrent(ff.getSize(), rnnDataFormat);
        } else {
            InputType.InputTypeConvolutionalFlat cf = (InputType.InputTypeConvolutionalFlat) inputType;
-            return InputType.recurrent(cf.getFlattenedSize());
+            return InputType.recurrent(cf.getFlattenedSize(), rnnDataFormat);
        }
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/RnnToCnnPreProcessor.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/RnnToCnnPreProcessor.java
@ -19,8 +19,10 @@ package org.deeplearning4j.nn.conf.preprocessor;
 import lombok.*;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.util.TimeSeriesUtils;
+import org.nd4j.enums.RnnDataFormat;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.shape.Shape;
 import org.nd4j.linalg.primitives.Pair;
@ -52,19 +54,27 @@ public class RnnToCnnPreProcessor implements InputPreProcessor {
    private int inputHeight;
    private int inputWidth;
    private int numChannels;
-
+    private RNNFormat rnnDataFormat = RNNFormat.NCW;
    @Getter(AccessLevel.NONE)
    @Setter(AccessLevel.NONE)
    private int product;

    public RnnToCnnPreProcessor(@JsonProperty("inputHeight") int inputHeight,
-                    @JsonProperty("inputWidth") int inputWidth, @JsonProperty("numChannels") int numChannels) {
+                                @JsonProperty("inputWidth") int inputWidth,
+                                @JsonProperty("numChannels") int numChannels,
+                                @JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat) {
        this.inputHeight = inputHeight;
        this.inputWidth = inputWidth;
        this.numChannels = numChannels;
        this.product = inputHeight * inputWidth * numChannels;
+        //rnnDataFormat can be null, e.g. when the "rnnDataFormat" property is absent from a configuration
+        // serialized before this field existed. A null value matches neither the NWC check in preProcess
+        // nor the NCW check in backprop, so backprop would skip its permute. Keep the NCW field default.
+        if (rnnDataFormat != null) {
+            this.rnnDataFormat = rnnDataFormat;
+        }
    }

+    /**
+     * Convenience constructor that delegates to the four-argument constructor with
+     * {@link RNNFormat#NCW} as the RNN data format.
+     *
+     * @param inputHeight value stored as the input height
+     * @param inputWidth  value stored as the input width
+     * @param numChannels value stored as the number of channels
+     */
+    public RnnToCnnPreProcessor(int inputHeight,
+                                int inputWidth,
+                                int numChannels){
+        this(inputHeight, inputWidth, numChannels, RNNFormat.NCW);
+    }

    @Override
    public INDArray preProcess(INDArray input, int miniBatchSize, LayerWorkspaceMgr workspaceMgr) {
@ -72,6 +82,9 @@ public class RnnToCnnPreProcessor implements InputPreProcessor {
            input = input.dup('f');
        //Input: 3d activations (RNN)
        //Output: 4d activations (CNN)
+        if (rnnDataFormat == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+        }
        val shape = input.shape();
        INDArray in2d;
        if (shape[0] == 1) {
@ -98,14 +111,17 @@ public class RnnToCnnPreProcessor implements InputPreProcessor {
        val shape = output.shape();
        //First: reshape 4d to 2d
        INDArray twod = output.reshape('c', output.size(0), ArrayUtil.prod(output.shape()) / output.size(0));
-        //Second: reshape 2d to 4d
+        //Second: reshape 2d to 3d
        INDArray reshaped = workspaceMgr.dup(ArrayType.ACTIVATION_GRAD, twod, 'f').reshape('f', miniBatchSize, shape[0] / miniBatchSize, product);
-        return reshaped.permute(0, 2, 1);
+        if (rnnDataFormat == RNNFormat.NCW) {
+            reshaped = reshaped.permute(0, 2, 1);
+        }
+        return reshaped;
    }

    @Override
    public RnnToCnnPreProcessor clone() {
-        return new RnnToCnnPreProcessor(inputHeight, inputWidth, numChannels);
+        return new RnnToCnnPreProcessor(inputHeight, inputWidth, numChannels, rnnDataFormat);
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/RnnToFeedForwardPreProcessor.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/preprocessor/RnnToFeedForwardPreProcessor.java
@ -16,11 +16,14 @@

 package org.deeplearning4j.nn.conf.preprocessor;

+import lombok.AllArgsConstructor;
 import lombok.Data;
+import lombok.NoArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.InputPreProcessor;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.util.TimeSeriesUtils;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -28,6 +31,7 @@ import org.nd4j.linalg.api.shape.Shape;
 import org.nd4j.linalg.primitives.Pair;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.deeplearning4j.nn.workspace.ArrayType;
+import org.nd4j.shade.jackson.annotation.JsonProperty;

 import java.util.Arrays;

@ -47,8 +51,14 @@ import java.util.Arrays;
 */
@Data
@Slf4j
+@NoArgsConstructor
 public class RnnToFeedForwardPreProcessor implements InputPreProcessor {

+    private RNNFormat rnnDataFormat = RNNFormat.NCW;
+
+    /**
+     * Creates a preprocessor for RNN activations stored in the given data format.
+     *
+     * @param rnnDataFormat data format of the 3d RNN activations. If null (for example when the
+     *                      "rnnDataFormat" property is missing from a serialized configuration),
+     *                      the {@link RNNFormat#NCW} field default is kept.
+     */
+    public RnnToFeedForwardPreProcessor(@JsonProperty("rnnDataFormat") RNNFormat rnnDataFormat){
+        //A null format would match neither the NWC check in preProcess nor the NCW check in backprop,
+        // so backprop would skip its permute. Only overwrite the NCW default with a non-null value.
+        if (rnnDataFormat != null) {
+            this.rnnDataFormat = rnnDataFormat;
+        }
+    }
    @Override
    public INDArray preProcess(INDArray input, int miniBatchSize, LayerWorkspaceMgr workspaceMgr) {
        //Need to reshape RNN activations (3d) activations to 2d (for input into feed forward layer)
@ -59,10 +69,13 @@ public class RnnToFeedForwardPreProcessor implements InputPreProcessor {
        if (input.ordering() != 'f' || !Shape.hasDefaultStridesForShape(input))
            input = workspaceMgr.dup(ArrayType.ACTIVATIONS, input, 'f');

+        if (rnnDataFormat == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+        }
        val shape = input.shape();
        INDArray ret;
        if (shape[0] == 1) {
-            ret = input.tensorAlongDimension(0, 1, 2).permutei(1, 0); //Edge case: miniBatchSize==1
+            ret = input.tensorAlongDimension(0, 1, 2).permute(1, 0); //Edge case: miniBatchSize==1
        } else if (shape[2] == 1) {
            ret = input.tensorAlongDimension(0, 1, 0); //Edge case: timeSeriesLength=1
        } else {
@ -85,17 +98,15 @@ public class RnnToFeedForwardPreProcessor implements InputPreProcessor {

        val shape = output.shape();
        INDArray reshaped = output.reshape('f', miniBatchSize, shape[0] / miniBatchSize, shape[1]);
-        return workspaceMgr.leverageTo(ArrayType.ACTIVATION_GRAD, reshaped.permute(0, 2, 1));
+        if (rnnDataFormat == RNNFormat.NCW){
+            reshaped = reshaped.permute(0, 2, 1);
+        }
+        return workspaceMgr.leverageTo(ArrayType.ACTIVATION_GRAD, reshaped);
    }

    @Override
    public RnnToFeedForwardPreProcessor clone() {
-        try {
-            RnnToFeedForwardPreProcessor clone = (RnnToFeedForwardPreProcessor) super.clone();
-            return clone;
-        } catch (CloneNotSupportedException e) {
-            throw new RuntimeException(e);
-        }
+        return new RnnToFeedForwardPreProcessor(rnnDataFormat);
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/BaseRecurrentLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/BaseRecurrentLayer.java
@ -18,7 +18,10 @@ package org.deeplearning4j.nn.layers.recurrent;

 import org.deeplearning4j.nn.api.layers.RecurrentLayer;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
+import org.deeplearning4j.nn.conf.inputs.InputType;
 import org.deeplearning4j.nn.layers.BaseLayer;
+import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;

@ -26,7 +29,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

-public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.nn.conf.layers.BaseLayer>
+public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.nn.conf.layers.BaseRecurrentLayer>
                extends BaseLayer<LayerConfT> implements RecurrentLayer {

    /**
@ -85,4 +88,19 @@ public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.n
        tBpttStateMap.putAll(state);
    }

+    /**
+     * @return the RNN data format reported by this layer's configuration
+     *         (the value of {@code layerConf().getRnnDataFormat()})
+     */
+    public RNNFormat getDataFormat(){
+        return layerConf().getRnnDataFormat();
+    }
+
+    /**
+     * Swaps dimensions 1 and 2 of the given array when this layer's data format is
+     * {@link RNNFormat#NWC}; for any other format the array is returned unchanged.
+     * The permutation (0, 2, 1) is its own inverse, so the same call converts in both directions.
+     * <p>
+     * NOTE(review): permute is called with three axes and no rank check, so this presumably
+     * expects rank 3 arrays - confirm that callers never pass rank 2 arrays when the format is NWC.
+     *
+     * @param arr array to convert; may be null
+     * @return null if {@code arr} is null, the permuted array if the format is NWC, otherwise {@code arr}
+     */
+    protected INDArray permuteIfNWC(INDArray arr){
+        if (arr == null){
+            return null;
+        }
+        if (getDataFormat() == RNNFormat.NWC){
+            return arr.permute(0, 2, 1);
+        }
+        return arr;
+    }
+
+
 }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/BidirectionalLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/BidirectionalLayer.java
@ -25,6 +25,7 @@ import org.deeplearning4j.nn.api.TrainingConfig;
 import org.deeplearning4j.nn.api.layers.RecurrentLayer;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional;
 import org.deeplearning4j.nn.gradient.DefaultGradient;
 import org.deeplearning4j.nn.gradient.Gradient;
@ -78,6 +79,9 @@ public class BidirectionalLayer implements RecurrentLayer {
        this.paramsView = paramsView;
    }

+    /** Returns the RNN data format reported by {@code layerConf.getRNNDataFormat()}. */
+    private RNNFormat getRNNDataFormat(){
+        return layerConf.getRNNDataFormat();
+    }
    @Override
    public INDArray rnnTimeStep(INDArray input, LayerWorkspaceMgr workspaceMgr) {
        throw new UnsupportedOperationException("Cannot RnnTimeStep bidirectional layers");
@ -140,7 +144,10 @@ public class BidirectionalLayer implements RecurrentLayer {
    public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
        INDArray eFwd;
        INDArray eBwd;
-
+        boolean permute = getRNNDataFormat() == RNNFormat.NWC && epsilon.rank() == 3;
+        if (permute){
+            epsilon = epsilon.permute(0, 2, 1);
+        }
        val n = epsilon.size(1)/2;
        switch (layerConf.getMode()){
            case ADD:
@ -165,6 +172,10 @@ public class BidirectionalLayer implements RecurrentLayer {

        eBwd = TimeSeriesUtils.reverseTimeSeries(eBwd, workspaceMgr, ArrayType.BP_WORKING_MEM);

+        if (permute){
+            eFwd = eFwd.permute(0, 2, 1);
+            eBwd = eBwd.permute(0, 2, 1);
+        }
        Pair<Gradient,INDArray> g1 = fwd.backpropGradient(eFwd, workspaceMgr);
        Pair<Gradient,INDArray> g2 = bwd.backpropGradient(eBwd, workspaceMgr);

@ -176,7 +187,9 @@ public class BidirectionalLayer implements RecurrentLayer {
            g.gradientForVariable().put(BidirectionalParamInitializer.BACKWARD_PREFIX + e.getKey(), e.getValue());
        }

-        INDArray g2Reversed = TimeSeriesUtils.reverseTimeSeries(g2.getRight(), workspaceMgr, ArrayType.BP_WORKING_MEM);
+        INDArray g2Right = permute ? g2.getRight().permute(0, 2, 1): g2.getRight();
+        INDArray g2Reversed = TimeSeriesUtils.reverseTimeSeries(g2Right, workspaceMgr, ArrayType.BP_WORKING_MEM);
+        g2Reversed = permute? g2Reversed.permute(0, 2, 1): g2Reversed;
        INDArray epsOut = g1.getRight().addi(g2Reversed);

        return new Pair<>(g, epsOut);
@ -186,25 +199,38 @@ public class BidirectionalLayer implements RecurrentLayer {
    public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
        INDArray out1 = fwd.activate(training, workspaceMgr);
        INDArray out2 = bwd.activate(training, workspaceMgr);
+        //For NWC, swap dimensions 1 and 2 of both outputs so that the time series reversal and the merge
+        // below (including the concat along dimension 1) run on the same layout as in the NCW case.
+        // Rank 2 outputs are not permuted.
+        boolean permute = getRNNDataFormat() == RNNFormat.NWC && out1.rank() == 3;
+        if(permute){
+            out1 = out1.permute(0, 2, 1);
+            out2 = out2.permute(0, 2, 1);
+        }
        //Reverse the output time series. Note: when using LastTimeStepLayer, output can be rank 2
        out2 = out2.rank() == 2 ? out2 : TimeSeriesUtils.reverseTimeSeries(out2, workspaceMgr, ArrayType.FF_WORKING_MEM);
-
+        //Each mode assigns its merged result to ret instead of returning directly, so that the
+        // permute applied above can be undone in one place after the switch
+        INDArray ret;
        switch (layerConf.getMode()){
            case ADD:
-                return out1.addi(out2);
+                ret = out1.addi(out2);
+                break;
            case MUL:
                //TODO may be more efficient ways than this...
                this.outFwd = out1.detach();
                this.outBwd = out2.detach();
-                return workspaceMgr.dup(ArrayType.ACTIVATIONS, out1).muli(out2);
+                ret = workspaceMgr.dup(ArrayType.ACTIVATIONS, out1).muli(out2);
+                break;
            case AVERAGE:
-                return out1.addi(out2).muli(0.5);
+                ret = out1.addi(out2).muli(0.5);
+                break;
            case CONCAT:
-                INDArray ret = Nd4j.concat(1, out1, out2);
-                return workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, ret);
+                ret = Nd4j.concat(1, out1, out2);
+                ret = workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, ret);
+                break;
            default:
                throw new RuntimeException("Unknown mode: " + layerConf.getMode());
        }
+        //Undo the permute applied to out1/out2 above, so the result has dimensions 1 and 2 in the
+        // same order as the wrapped layers' outputs
+        if (permute){
+            ret = ret.permute(0, 2, 1);
+        }
+        return ret;
    }

    @Override
@ -465,7 +491,9 @@ public class BidirectionalLayer implements RecurrentLayer {
    public void setInput(INDArray input, LayerWorkspaceMgr layerWorkspaceMgr) {
        this.input = input;
        fwd.setInput(input, layerWorkspaceMgr);
-
+        if (getRNNDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+        }
        INDArray reversed;
        if(!input.isAttached()){
            try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) {
@ -478,6 +506,9 @@ public class BidirectionalLayer implements RecurrentLayer {
                reversed = TimeSeriesUtils.reverseTimeSeries(input);
            }
        }
+        if (getRNNDataFormat() == RNNFormat.NWC){
+            reversed = reversed.permute(0, 2, 1);
+        }
        bwd.setInput(reversed, layerWorkspaceMgr);
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java
@ -88,12 +88,12 @@ public class GravesBidirectionalLSTM
        }

        final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr);
-
+        fwdPass.fwdPassOutput = permuteIfNWC(fwdPass.fwdPassOutput);
        final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this,
                        this.conf,
-                        this.layerConf().getGateActivationFn(), this.input,
+                        this.layerConf().getGateActivationFn(), permuteIfNWC(this.input),
                        getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
-                        getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
+                        getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), permuteIfNWC(epsilon),
                        truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                        GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
                        GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
@ -106,16 +106,17 @@ public class GravesBidirectionalLSTM

        final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this,
                        this.conf,
-                        this.layerConf().getGateActivationFn(), this.input,
+                        this.layerConf().getGateActivationFn(), permuteIfNWC(this.input),
                        getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
-                        getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
+                        getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), permuteIfNWC(epsilon),
                        truncatedBPTT, tbpttBackwardLength, backPass, false,
                        GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
                        GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
                        GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true,
                        null, workspaceMgr, layerConf().isHelperAllowFallback());

-
+        forwardsGradient.setSecond(permuteIfNWC(forwardsGradient.getSecond()));
+        backwardsGradient.setSecond(permuteIfNWC(backwardsGradient.getSecond()));
        //merge the gradient, which is key value pair of String,INDArray
        //the keys for forwards and backwards should be different

@ -171,7 +172,7 @@ public class GravesBidirectionalLSTM
        } else {

            forwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
-                            this.input, getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
+                            permuteIfNWC(this.input), getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
                            getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
                            getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null,
                            forBackprop || (cacheMode != CacheMode.NONE && training), true,
@ -179,7 +180,7 @@ public class GravesBidirectionalLSTM
                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());

            backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
-                            this.input,
+                            permuteIfNWC(this.input),
                            getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
                            getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS),
                            getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null,
@ -187,6 +188,8 @@ public class GravesBidirectionalLSTM
                            GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null,
                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());

+            forwardsEval.fwdPassOutput = permuteIfNWC(forwardsEval.fwdPassOutput);
+            backwardsEval.fwdPassOutput = permuteIfNWC(backwardsEval.fwdPassOutput);
            cachedPassForward = forwardsEval;
            cachedPassBackward = backwardsEval;
        }
@ -228,10 +231,12 @@ public class GravesBidirectionalLSTM
                biasKey = GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS;
            }

-            return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input,
+            FwdPassReturn ret = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), permuteIfNWC(this.input),
                            getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training,
                            prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true,
                            null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
+            ret.fwdPassOutput = permuteIfNWC(ret.fwdPassOutput);
+            return ret;
        }
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java
@ -20,6 +20,7 @@ import lombok.extern.slf4j.Slf4j;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.params.GravesLSTMParamInitializer;
 import org.nd4j.base.Preconditions;
@ -89,17 +90,17 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
        } else {
            fwdPass = activateHelper(true, null, null, true, workspaceMgr);
        }
-
+        fwdPass.fwdPassOutput = permuteIfNWC(fwdPass.fwdPassOutput);

        Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this,
-                        this.conf, this.layerConf().getGateActivationFn(), this.input,
-                        recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
+                        this.conf, this.layerConf().getGateActivationFn(), permuteIfNWC(this.input),
+                        recurrentWeights, inputWeights, permuteIfNWC(epsilon), truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                        GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY,
                        GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null,
                        workspaceMgr, layerConf().isHelperAllowFallback());

        weightNoiseParams.clear();
-        p.setSecond(backpropDropOutIfPresent(p.getSecond()));
+        p.setSecond(permuteIfNWC(backpropDropOutIfPresent(p.getSecond())));
        return p;
    }

@ -117,8 +118,8 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
    private FwdPassReturn activateHelper(final boolean training, final INDArray prevOutputActivations,
                    final INDArray prevMemCellState, boolean forBackprop, LayerWorkspaceMgr workspaceMgr) {
        assertInputSet(false);
-        Preconditions.checkState(input.rank() == 3,
-                "3D input expected to RNN layer expected, got " + input.rank());
+        Preconditions.checkState(this.input.rank() == 3,
+                "3D input expected to RNN layer expected, got " + this.input.rank());
        applyDropOutIfNecessary(training, workspaceMgr);

 //        if (cacheMode == null)
@ -136,18 +137,17 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
        final INDArray recurrentWeights = getParamWithNoise(GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY, training, workspaceMgr); //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
        final INDArray inputWeights = getParamWithNoise(GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, training, workspaceMgr); //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
        final INDArray biases = getParamWithNoise(GravesLSTMParamInitializer.BIAS_KEY, training, workspaceMgr); //by row: IFOG			//Shape: [4,hiddenLayerSize]; order: [bi,bf,bo,bg]^T
-
+        INDArray input = permuteIfNWC(this.input);
        FwdPassReturn fwd = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
-                        this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
+                        input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
                        prevMemCellState, forBackprop || (cacheMode != CacheMode.NONE && training), true,
                        GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, true, null,
                        cacheMode, workspaceMgr, layerConf().isHelperAllowFallback());

-
+        fwd.fwdPassOutput = permuteIfNWC(fwd.fwdPassOutput);
        if (training && cacheMode != CacheMode.NONE) {
            cachedFwdPass = fwd;
        }
-
        return fwd;
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java
@ -123,17 +123,16 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
        } else {
            fwdPass = activateHelper(true, null, null, true, workspaceMgr);
        }
-
-
+        fwdPass.fwdPassOutput = permuteIfNWC(fwdPass.fwdPassOutput);
        Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this,
-                        this.conf, this.layerConf().getGateActivationFn(), this.input,
-                        recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
+                        this.conf, this.layerConf().getGateActivationFn(), permuteIfNWC(this.input),
+                        recurrentWeights, inputWeights, permuteIfNWC(epsilon), truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                        LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY,
                        LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr,
                        layerConf().isHelperAllowFallback());

        weightNoiseParams.clear();
-        p.setSecond(backpropDropOutIfPresent(p.getSecond()));
+        p.setSecond(permuteIfNWC(backpropDropOutIfPresent(p.getSecond())));
        return p;
    }

@ -167,17 +166,18 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
        final INDArray recurrentWeights = getParamWithNoise(LSTMParamInitializer.RECURRENT_WEIGHT_KEY, training, workspaceMgr); //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
        final INDArray inputWeights = getParamWithNoise(LSTMParamInitializer.INPUT_WEIGHT_KEY, training, workspaceMgr); //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
        final INDArray biases = getParamWithNoise(LSTMParamInitializer.BIAS_KEY, training, workspaceMgr); //by row: IFOG			//Shape: [4,hiddenLayerSize]; order: [bi,bf,bo,bg]^T
-
+        INDArray input = permuteIfNWC(this.input);
        FwdPassReturn fwd = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
-                        this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
+                        input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
                        prevMemCellState, (training && cacheMode != CacheMode.NONE) || forBackprop, true,
                        LSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, false, helper,
                        forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());

+        fwd.fwdPassOutput = permuteIfNWC(fwd.fwdPassOutput);
+
        if (training && cacheMode != CacheMode.NONE) {
            cachedFwdPass = fwd;
        }
-
        return fwd;
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java
@ -465,7 +465,6 @@ public class LSTMHelpers {
        val miniBatchSize = epsilon.size(0);
        boolean is2dInput = epsilon.rank() < 3; //Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to [miniBatchSize,n^(L+1),1]
        val timeSeriesLength = (is2dInput ? 1 : epsilon.size(2));
-
        INDArray wFFTranspose = null;
        INDArray wOOTranspose = null;
        INDArray wGGTranspose = null;
@ -573,14 +572,14 @@ public class LSTMHelpers {
                    nablaCellState = Nd4j.create(inputWeights.dataType(), new long[]{miniBatchSize, hiddenLayerSize}, 'f');
                }

-                INDArray prevMemCellState = (iTimeIndex == 0 ? fwdPass.prevMemCell : fwdPass.memCellState[(int) (time - inext)]);
+                INDArray prevMemCellState = (iTimeIndex == 0 ? fwdPass.prevMemCell : fwdPass.memCellState[(time - inext)]);
                INDArray prevHiddenUnitActivation =
-                        (iTimeIndex == 0 ? fwdPass.prevAct : fwdPass.fwdPassOutputAsArrays[(int) (time - inext)]);
-                INDArray currMemCellState = fwdPass.memCellState[(int) time];
+                        (iTimeIndex == 0 ? fwdPass.prevAct : fwdPass.fwdPassOutputAsArrays[(time - inext)]);
+                INDArray currMemCellState = fwdPass.memCellState[time];

                //LSTM unit output errors (dL/d(a_out)); not to be confused with \delta=dL/d(z_out)
-                INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension((int) time, 1, 0)); //(w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.

+                INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension(time, 1, 0)); //(w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.
                INDArray nablaOut = Shape.toOffsetZeroCopy(epsilonSlice, 'f'); //Shape: [m,n^L]
                if (iTimeIndex != timeSeriesLength - 1) {
                    //if t == timeSeriesLength-1 then deltaiNext etc are zeros
@ -666,7 +665,7 @@ public class LSTMHelpers {
                    //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid using errors from a masked time step
                    // to calculate the parameter gradients.  Mask array has shape [minibatch, timeSeriesLength] -> get column(this time step)
                    timeStepMaskColumn = maskArray.getColumn(time, true);
-                    deltaifogNext.muliColumnVector(timeStepMaskColumn);
+                    deltaifogNext.muli(timeStepMaskColumn);
                    //Later, the deltaifogNext is used to calculate: input weight gradients, recurrent weight gradients, bias gradients
                }

@ -737,7 +736,7 @@ public class LSTMHelpers {
                if (maskArray != null) {
                    //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid sending anything
                    // but 0s to the layer below at this time step (for the given example)
-                    epsilonNextSlice.muliColumnVector(timeStepMaskColumn);
+                    epsilonNextSlice.muli(timeStepMaskColumn);
                }
            }
        }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LastTimeStepLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LastTimeStepLayer.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.layers.recurrent;
 import lombok.NonNull;
 import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer;
 import org.deeplearning4j.util.TimeSeriesUtils;
@ -59,18 +60,41 @@ public class LastTimeStepLayer extends BaseWrapperLayer {

    @Override
    public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
-        INDArray newEps = Nd4j.create(epsilon.dataType(), origOutputShape, 'f');
+        long[] newEpsShape = origOutputShape;
+        //Resolve the data format from the layer configuration, exactly as the last-time-step extraction in this class
+        // does. The helper also looks through wrapper configurations (e.g. Bidirectional) that a direct instanceof
+        // check on the wrapped layer misses; if the two passes disagreed, epsilon would be written along the wrong
+        // axis. Time is axis 1 of the wrapped layer's output for NWC and axis 2 otherwise.
+        boolean nwc = TimeSeriesUtils.getFormatFromRnnLayer(underlying.conf().getLayer()) == RNNFormat.NWC;
+        INDArray newEps = Nd4j.create(epsilon.dataType(), newEpsShape, 'f');
        if(lastTimeStepIdxs == null){
            //no mask case
-            newEps.put(new INDArrayIndex[]{all(), all(), point(origOutputShape[2]-1)}, epsilon);
-        } else {
-            INDArrayIndex[] arr = new INDArrayIndex[]{null, all(), null};
-            //TODO probably possible to optimize this with reshape + scatter ops...
-            for( int i=0; i<lastTimeStepIdxs.length; i++ ){
-                arr[0] = point(i);
-                arr[2] = point(lastTimeStepIdxs[i]);
-                newEps.put(arr, epsilon.getRow(i));
+            if (nwc){
+                newEps.put(new INDArrayIndex[]{all(), point(origOutputShape[1]-1), all()}, epsilon);
            }
+            else{
+                newEps.put(new INDArrayIndex[]{all(), all(), point(origOutputShape[2]-1)}, epsilon);
+            }
+        } else {
+            if (nwc){
+                INDArrayIndex[] arr = new INDArrayIndex[]{null, null, all()};
+                //TODO probably possible to optimize this with reshape + scatter ops...
+                for( int i=0; i<lastTimeStepIdxs.length; i++ ){
+                    arr[0] = point(i);
+                    arr[1] = point(lastTimeStepIdxs[i]);    //Time index recorded for example i (axis 1 for NWC)
+                    newEps.put(arr, epsilon.getRow(i));
+                }
+            }
+            else{
+                INDArrayIndex[] arr = new INDArrayIndex[]{null, all(), null};
+                //TODO probably possible to optimize this with reshape + scatter ops...
+                for( int i=0; i<lastTimeStepIdxs.length; i++ ){
+                    arr[0] = point(i);
+                    arr[2] = point(lastTimeStepIdxs[i]);    //Time index recorded for example i (axis 2 otherwise)
+                    newEps.put(arr, epsilon.getRow(i));
+                }
+            }
+
        }
        return underlying.backpropGradient(newEps, workspaceMgr);
    }
@ -103,10 +127,18 @@ public class LastTimeStepLayer extends BaseWrapperLayer {
                    "rank " + in.rank() + " with shape " + Arrays.toString(in.shape()));
        }
        origOutputShape = in.shape();
+        boolean nwc = TimeSeriesUtils.getFormatFromRnnLayer(underlying.conf().getLayer()) == RNNFormat.NWC;
+//        underlying instanceof  BaseRecurrentLayer && ((BaseRecurrentLayer)underlying).getDataFormat() == RNNFormat.NWC)||
+//                underlying instanceof MaskZeroLayer && ((MaskZeroLayer)underlying).getUnderlying() instanceof BaseRecurrentLayer &&
+//                        ((BaseRecurrentLayer)((MaskZeroLayer)underlying).getUnderlying()).getDataFormat() == RNNFormat.NWC;
+        if (nwc){
+            in = in.permute(0, 2, 1);
+        }

        INDArray mask = underlying.getMaskArray();
        Pair<INDArray,int[]> p = TimeSeriesUtils.pullLastTimeSteps(in, mask, workspaceMgr, arrayType);
        lastTimeStepIdxs = p.getSecond();
+
        return p.getFirst();
    }
 }
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/MaskZeroLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/MaskZeroLayer.java
@ -30,6 +30,9 @@ import org.nd4j.linalg.primitives.Pair;
 import lombok.NonNull;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;

+import static org.deeplearning4j.nn.conf.RNNFormat.NCW;
+import static org.deeplearning4j.nn.conf.RNNFormat.NWC;
+
 /**
 * Masks timesteps with activation equal to the specified masking value, defaulting to 0.0.
 * Assumes that the input shape is [batch_size, input_size, timesteps].
@ -76,7 +79,11 @@ public class MaskZeroLayer extends BaseWrapperLayer {
            throw new IllegalArgumentException("Expected input of shape [batch_size, timestep_input_size, timestep], " +
                    "got shape "+Arrays.toString(input.shape()) + " instead");
        }
-        INDArray mask = input.eq(maskingValue).castTo(input.dataType()).sum(1).neq(input.shape()[1]);
+        if ((underlying instanceof BaseRecurrentLayer &&
+                ((BaseRecurrentLayer)underlying).getDataFormat() == NWC)){
+            input = input.permute(0, 2, 1);
+        }
+        INDArray mask = input.eq(maskingValue).castTo(input.dataType()).sum(1).neq(input.shape()[1]).castTo(input.dataType());
        underlying.setMaskArray(mask.detach());
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/RnnLossLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/RnnLossLayer.java
@ -22,6 +22,7 @@ import org.deeplearning4j.eval.Evaluation;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.api.layers.IOutputLayer;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.DefaultGradient;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.layers.BaseLayer;
@ -60,6 +61,8 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn
    @Override
    public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
        assertInputSet(true);
+        INDArray input = this.input;
+        INDArray labels = this.labels;
        if (input.rank() != 3)
            throw new UnsupportedOperationException(
                            "Input is not rank 3. Expected rank 3 input of shape [minibatch, size, sequenceLength]. Got input with rank " +
@ -67,6 +70,10 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn
        if (labels == null)
            throw new IllegalStateException("Labels are not set (null)");

+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+            labels = labels.permute(0, 2, 1);
+        }
        Preconditions.checkState(labels.rank() == 3, "Expected rank 3 labels array, got label array with shape %ndShape", labels);
        Preconditions.checkState(input.size(2) == labels.size(2), "Sequence lengths do not match for RnnOutputLayer input and labels:" +
                "Arrays should be rank 3 with shape [minibatch, size, sequenceLength] - mismatch on dimension 2 (sequence length) - input=%ndShape vs. label=%ndShape", input, labels);
@ -90,7 +97,9 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn
        INDArray delta2d = lossFunction.computeGradient(labels2d, input2d.dup(input2d.ordering()), layerConf().getActivationFn(), maskReshaped);

        INDArray delta3d = TimeSeriesUtils.reshape2dTo3d(delta2d, input.size(0), workspaceMgr, ArrayType.ACTIVATION_GRAD);
-
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            delta3d = delta3d.permute(0, 2, 1);
+        }
        // grab the empty gradient
        Gradient gradient = new DefaultGradient();
        return new Pair<>(gradient, delta3d);
@ -159,13 +168,21 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn
    @Override
    public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
        assertInputSet(false);
+        INDArray input = this.input;    //Local reference: the field itself is never replaced by the permuted array
        if (input.rank() != 3)
            throw new UnsupportedOperationException(
                            "Input must be rank 3. Got input with rank " + input.rank() + " " + layerId());
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);     //Only after the rank check above: a 3-axis permute needs rank 3 input
+        }

        INDArray as2d = TimeSeriesUtils.reshape3dTo2d(input);
        INDArray out2d = layerConf().getActivationFn().getActivation(workspaceMgr.dup(ArrayType.ACTIVATIONS, as2d, as2d.ordering()), training);
-        return workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, TimeSeriesUtils.reshape2dTo3d(out2d, input.size(0), workspaceMgr, ArrayType.ACTIVATIONS));
+        INDArray ret = workspaceMgr.leverageTo(ArrayType.ACTIVATIONS, TimeSeriesUtils.reshape2dTo3d(out2d, input.size(0), workspaceMgr, ArrayType.ACTIVATIONS));
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            ret = ret.permute(0, 2, 1);     //Undo the permute applied to the input so the output is NWC again
+        }
+        return ret;
    }

    @Override
@ -196,6 +213,12 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn

    @Override
    public double computeScore(double fullNetRegTerm, boolean training, LayerWorkspaceMgr workspaceMgr) {
+        INDArray input = this.input;
+        INDArray labels = this.labels;
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+            labels = labels.permute(0, 2, 1);    //Permute the labels themselves, not the already-permuted input
+        }
        INDArray input2d = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.FF_WORKING_MEM);
        INDArray labels2d = TimeSeriesUtils.reshape3dTo2d(labels, workspaceMgr, ArrayType.FF_WORKING_MEM);
        INDArray maskReshaped;
@ -228,10 +251,14 @@ public class RnnLossLayer extends BaseLayer<org.deeplearning4j.nn.conf.layers.Rn
    @Override
    public INDArray computeScoreForExamples(double fullNetRegTerm, LayerWorkspaceMgr workspaceMgr) {
        //For RNN: need to sum up the score over each time step before returning.
-
+        INDArray input = this.input;
+        INDArray labels = this.labels;
        if (input == null || labels == null)
            throw new IllegalStateException("Cannot calculate score without input and labels " + layerId());
-
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+            labels = labels.permute(0, 2, 1);    //Permute the labels themselves, not the already-permuted input
+        }
        INDArray input2d = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.FF_WORKING_MEM);
        INDArray labels2d = TimeSeriesUtils.reshape3dTo2d(labels, workspaceMgr, ArrayType.FF_WORKING_MEM);

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/RnnOutputLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/RnnOutputLayer.java
@ -19,6 +19,7 @@ package org.deeplearning4j.nn.layers.recurrent;
 import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.layers.BaseOutputLayer;
 import org.deeplearning4j.nn.params.DefaultParamInitializer;
@ -57,11 +58,15 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
                    "Input is not rank 3. RnnOutputLayer expects rank 3 input with shape [minibatch, layerInSize, sequenceLength]." +
                            " Got input with rank " + input.rank() + " and shape " + Arrays.toString(input.shape()) + " - " + layerId());
        }
+        int td = (layerConf().getRnnDataFormat()==RNNFormat.NCW)? 2: 1;
        Preconditions.checkState(labels.rank() == 3, "Expected rank 3 labels array, got label array with shape %ndShape", labels);
-        Preconditions.checkState(input.size(2) == labels.size(2), "Sequence lengths do not match for RnnOutputLayer input and labels:" +
+        Preconditions.checkState(input.size(td) == labels.size(td), "Sequence lengths do not match for RnnOutputLayer input and labels:" +
                "Arrays should be rank 3 with shape [minibatch, size, sequenceLength] - mismatch on dimension 2 (sequence length) - input=%ndShape vs. label=%ndShape", input, labels);

        INDArray inputTemp = input;
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            this.input = input.permute(0, 2, 1);
+        }
        this.input = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.BP_WORKING_MEM);

        applyDropOutIfNecessary(true, workspaceMgr);    //Edge case: we skip OutputLayer forward pass during training as this isn't required to calculate gradients
@ -71,7 +76,9 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
        INDArray epsilon2d = gradAndEpsilonNext.getSecond();

        INDArray epsilon3d = TimeSeriesUtils.reshape2dTo3d(epsilon2d, input.size(0), workspaceMgr, ArrayType.ACTIVATION_GRAD);
-
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            epsilon3d = epsilon3d.permute(0, 2, 1);
+        }
        weightNoiseParams.clear();

        //epsilon3d = backpropDropOutIfPresent(epsilon3d);
@ -104,6 +111,7 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
        if (input.rank() == 3) {
            //Case when called from RnnOutputLayer
            INDArray inputTemp = input;
+            input = (layerConf().getRnnDataFormat()==RNNFormat.NWC)? input.permute(0, 2, 1):input;
            input = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.FF_WORKING_MEM);
            INDArray out = super.preOutput(training, workspaceMgr);
            this.input = inputTemp;
@ -117,13 +125,17 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l

    @Override
    protected INDArray getLabels2d(LayerWorkspaceMgr workspaceMgr, ArrayType arrayType) {
-        if (labels.rank() == 3)
+        if (labels.rank() == 3){    //Rank 3 labels: apply permute(0, 2, 1) when the format is NWC, then reshape to 2d
+            final boolean nwc = (layerConf().getRnnDataFormat() == RNNFormat.NWC);
+            final INDArray labels = nwc ? this.labels.permute(0, 2, 1) : this.labels;   //Shadows the field in this block only
            return TimeSeriesUtils.reshape3dTo2d(labels, workspaceMgr, arrayType);
+        }
        return labels;
    }

    @Override
    public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
+        INDArray input = this.input;
        if (input.rank() != 3)
            throw new UnsupportedOperationException(
                            "Input must be rank 3. Got input with rank " + input.rank() + " " + layerId());
@ -131,6 +143,9 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
        INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, training, workspaceMgr);

        applyDropOutIfNecessary(training, workspaceMgr);
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            input = input.permute(0, 2, 1);
+        }
        INDArray input2d = TimeSeriesUtils.reshape3dTo2d(input.castTo(W.dataType()), workspaceMgr, ArrayType.FF_WORKING_MEM);

        INDArray act2d = layerConf().getActivationFn().getActivation(input2d.mmul(W).addiRowVector(b), training);
@ -144,7 +159,11 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
            }
        }

-        return TimeSeriesUtils.reshape2dTo3d(act2d, input.size(0), workspaceMgr, ArrayType.ACTIVATIONS);
+        INDArray ret = TimeSeriesUtils.reshape2dTo3d(act2d, input.size(0), workspaceMgr, ArrayType.ACTIVATIONS);
+        if (layerConf().getRnnDataFormat() == RNNFormat.NWC){
+            ret = ret.permute(0, 2, 1);
+        }
+        return ret;
    }

    @Override
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/SimpleRnn.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/SimpleRnn.java
@ -18,6 +18,7 @@ package org.deeplearning4j.nn.layers.recurrent;

 import lombok.val;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.DefaultGradient;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.params.SimpleRnnParamInitializer;
@ -50,6 +51,7 @@ import static org.nd4j.linalg.indexing.NDArrayIndex.*;
 public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.recurrent.SimpleRnn> {
    public static final String STATE_KEY_PREV_ACTIVATION = "prevAct";

+
    public SimpleRnn(NeuralNetConfiguration conf, DataType dataType) {
        super(conf, dataType);
    }
@ -92,6 +94,7 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
        val nOut = layerConf().getNOut();

        INDArray input = this.input.castTo(dataType);   //No-op if correct type
+        input = permuteIfNWC(input);

        //First: Do forward pass to get gate activations and Zs
        Quad<INDArray,INDArray, INDArray, INDArray> p = activateHelper(null, true, true, workspaceMgr);
@ -125,8 +128,9 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
        } else {
            end = 0;
        }
+        epsilon = permuteIfNWC(epsilon);
        for( long i = tsLength-1; i>= end; i--){
-            INDArray dldaCurrent = epsilon.get(all(), all(), point(i));
+            INDArray dldaCurrent = epsilon.get(all(), all(), point(i)).dup();
            INDArray aCurrent = p.getFirst().get(all(), all(), point(i));
            INDArray zCurrent = p.getSecond().get(all(), all(), point(i));
            INDArray nCurrent = (hasLayerNorm() ? p.getThird().get(all(), all(), point(i)) : null);
@ -141,7 +145,7 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
                //Recurrent weight gradients:
                Nd4j.gemm(aCurrent, dldzNext, rwg, true, false, 1.0, 1.0);
            }
-            INDArray dldzCurrent = a.backprop(zCurrent.dup(), dldaCurrent.dup()).getFirst();
+            INDArray dldzCurrent = a.backprop(zCurrent.dup(), dldaCurrent).getFirst();

            //Handle masking
            INDArray maskCol = null;
@ -200,6 +204,7 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
        }

        epsOut = backpropDropOutIfPresent(epsOut);
+        epsOut = permuteIfNWC(epsOut);
        return new Pair<>(grad, epsOut);
    }

@ -224,6 +229,7 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
        applyDropOutIfNecessary(training, workspaceMgr);

        INDArray input = this.input.castTo(dataType);    //No-op if correct type
+        input = permuteIfNWC(input);
        val m = input.size(0);
        val tsLength = input.size(2);
        val nOut = layerConf().getNOut();
@ -300,7 +306,12 @@ public class SimpleRnn extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.lay
                Nd4j.getExecutioner().exec(new BroadcastMulOp(outZ, mask, outZ, 0, 2));
            }
        }
-
+        if (!forBackprop) {
+            out = permuteIfNWC(out);
+            outZ = permuteIfNWC(outZ);
+            outPreNorm = permuteIfNWC(outPreNorm);
+            recPreNorm = permuteIfNWC(recPreNorm);
+        }
        return new Quad<>(out, outZ, outPreNorm, recPreNorm);
    }

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/TimeDistributedLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/TimeDistributedLayer.java
@ -2,6 +2,7 @@ package org.deeplearning4j.nn.layers.recurrent;

 import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
+import org.deeplearning4j.nn.conf.RNNFormat;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer;
 import org.deeplearning4j.nn.workspace.ArrayType;
@ -22,11 +23,11 @@ import org.nd4j.linalg.util.ArrayUtil;
 */
 public class TimeDistributedLayer extends BaseWrapperLayer {

-    private final int timeAxis;
+    private RNNFormat rnnDataFormat;

-    public TimeDistributedLayer(Layer underlying, int timeAxis) {
+    public TimeDistributedLayer(Layer underlying, RNNFormat rnnDataFormat) {
        super(underlying);
-        this.timeAxis = timeAxis;
+        this.rnnDataFormat = rnnDataFormat;    //Read by reshape/revertReshape to pick the time axis: 2 for NCW, otherwise 1
    }


@ -56,7 +57,7 @@ public class TimeDistributedLayer extends BaseWrapperLayer {
    protected INDArray reshape(INDArray array){
        //Reshape the time axis to the minibatch axis
        //For example, for RNN -> FF (dense time distributed): [mb, size, seqLen] -> [mb x seqLen, size]
-        int axis = timeAxis;
+        int axis = (rnnDataFormat == RNNFormat.NCW) ? 2 : 1;
        if(axis < 0)
            axis += array.rank();

@ -91,7 +92,7 @@ public class TimeDistributedLayer extends BaseWrapperLayer {

    protected INDArray revertReshape(INDArray toRevert, long minibatch){

-        int axis = timeAxis;
+        int axis = (rnnDataFormat == RNNFormat.NCW)? 2 : 1;
        if(axis < 0)
            axis += (toRevert.rank()+1);

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/TimeSeriesUtils.java
@ -17,6 +17,13 @@
 package org.deeplearning4j.util;

 import lombok.val;
+import org.deeplearning4j.nn.conf.RNNFormat;
+import org.deeplearning4j.nn.conf.layers.BaseRecurrentLayer;
+import org.deeplearning4j.nn.conf.layers.Layer;
+import org.deeplearning4j.nn.conf.layers.recurrent.Bidirectional;
+import org.deeplearning4j.nn.conf.layers.recurrent.LastTimeStep;
+import org.deeplearning4j.nn.conf.layers.recurrent.TimeDistributed;
+import org.deeplearning4j.nn.conf.layers.util.MaskZeroLayer;
 import org.nd4j.base.Preconditions;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.shape.Shape;
@ -233,6 +240,12 @@ public class TimeSeriesUtils {
        return outReshape.reshape('f', in.size(0), in.size(1), in.size(2));
    }

+    /** Reverses {@code in} via the 3-argument overload; input that is not NCW is permuted (0,2,1) before and after. */
+    public static INDArray reverseTimeSeries(INDArray in, LayerWorkspaceMgr workspaceMgr, ArrayType arrayType, RNNFormat dataFormat){
+        final boolean ncw = (dataFormat == RNNFormat.NCW);
+        final INDArray reversed = reverseTimeSeries(ncw ? in : in.permute(0, 2, 1), workspaceMgr, arrayType);
+        return ncw ? reversed : reversed.permute(0, 2, 1);
+    }
    /**
     * Reverse an input time series along the time dimension
     *
@ -423,4 +436,25 @@ public class TimeSeriesUtils {

        return new Pair<>(workspaceMgr.leverageTo(arrayType, out), fwdPassTimeSteps);
    }
+
+    /**
+     * Look up the {@link RNNFormat} used by a recurrent layer configuration.<br>
+     * A {@link BaseRecurrentLayer} or {@link TimeDistributed} configuration reports its own format. The wrapper
+     * configurations {@link MaskZeroLayer} and {@link LastTimeStep} are looked through via getUnderlying(), and
+     * {@link Bidirectional} via getFwd(); the lookup then continues recursively on the wrapped configuration.
+     *
+     * @param layer Layer configuration to inspect
+     * @return The RNNFormat reported by the layer, or by the layer it (transitively) wraps
+     * @throws IllegalStateException If the layer, or a wrapped layer, is of none of the types listed above
+     */
+    public static RNNFormat getFormatFromRnnLayer(Layer layer){
+        //Recurrent layers carry the format themselves
+        if(layer instanceof BaseRecurrentLayer){ return ((BaseRecurrentLayer) layer).getRnnDataFormat(); }
+        //Wrappers: recurse into the wrapped configuration
+        if(layer instanceof MaskZeroLayer){ return getFormatFromRnnLayer(((MaskZeroLayer) layer).getUnderlying()); }
+        if(layer instanceof Bidirectional){ return getFormatFromRnnLayer(((Bidirectional) layer).getFwd()); }
+        if(layer instanceof LastTimeStep){ return getFormatFromRnnLayer(((LastTimeStep) layer).getUnderlying()); }
+        if(layer instanceof TimeDistributed){ return ((TimeDistributed) layer).getRnnDataFormat(); }
+        throw new IllegalStateException("Unable to get RNNFormat from layer of type: " + layer);
+    }
 }