DL4J LSTM and Dropout CuDNN fallback and options (#152)

* add fallback for Conv layer activation

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add fallback and config option for LSTM layers

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add fallback option and setting for dropout

Signed-off-by: Ryan Nett <rnett@skymind.io>

* fix comments and error messages

Signed-off-by: Ryan Nett <rnett@skymind.io>

* move helper fail count to layer instance

Signed-off-by: Ryan Nett <rnett@skymind.io>

* ignore helperCountFail for equals and json

Signed-off-by: Ryan Nett <rnett@skymind.io>

* typo fix (MLK -> MKL)

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add MKLDNN to error messages

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add helperAllowFallback to builders, deprecate cudnnAllowFallback

Signed-off-by: Ryan Nett <rnett@skymind.io>
master
Ryan Nett 2019-08-28 20:05:01 -07:00 committed by Alex Black
parent 70af8c2afc
commit f40bdcf885
17 changed files with 302 additions and 56 deletions
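
As a usage illustration (not part of the diff): the new option is set on the layer builders and on the Dropout configuration roughly as follows. Apart from the helperAllowFallback(...) calls added in this change, everything here is pre-existing DL4J API; the class name is illustrative only.

import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.LSTM;
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.lossfunctions.LossFunctions;

public class HelperFallbackConfigExample {
    public static void main(String[] args) {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .list()
                .layer(0, new LSTM.Builder()
                        .nIn(10).nOut(20)
                        // New in this change: propagate CuDNN/MKL-DNN helper errors
                        // instead of silently falling back to the built-in implementation
                        .helperAllowFallback(false)
                        // Dropout fallback is configured on the IDropout instance itself
                        .dropOut(new Dropout(0.5).helperAllowFallback(false))
                        .build())
                .layer(1, new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX)
                        .nIn(20).nOut(5)
                        .build())
                .build();
        System.out.println(conf.toJson());   // helperCountFail is excluded from JSON, per this commit
    }
}

The default remains helperAllowFallback(true), which matches the previous cudnnAllowFallback behaviour of logging a warning and continuing with the built-in implementation.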

View File

@ -186,7 +186,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true,
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutput;
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutput;
final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(),
lstm.input(),
@ -194,7 +194,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, null,
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutputAsArrays;
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutputAsArrays;
//I have no idea what the heck this does --Ben
for (int i = 0; i < timeSeriesLength; i++) {

View File

@ -18,6 +18,8 @@ package org.deeplearning4j.nn.conf.dropout;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
@ -26,11 +28,11 @@ import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
import org.nd4j.linalg.api.ops.random.impl.DropOutInverted;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.schedule.ISchedule;
import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties;
import org.nd4j.shade.jackson.annotation.JsonProperty;
import org.nd4j.util.OneTimeLogger;
/**
* Implements standard (inverted) dropout.<br>
@ -64,17 +66,29 @@ import org.nd4j.util.OneTimeLogger;
* @author Alex Black
*/
@Data
@JsonIgnoreProperties({"mask", "helper"})
@EqualsAndHashCode(exclude = {"mask", "helper"})
@JsonIgnoreProperties({"mask", "helper", "helperCountFail"})
@EqualsAndHashCode(exclude = {"mask", "helper", "helperCountFail"})
@Slf4j
public class Dropout implements IDropout {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for Dropout will be used
*
*/
@Getter
@Setter
protected boolean helperAllowFallback = true;
private double p;
private ISchedule pSchedule;
private transient INDArray mask;
private transient DropoutHelper helper;
private boolean initializedHelper = false;
private int helperCountFail = 0;
/**
* @param activationRetainProbability Probability of retaining an activation - see {@link Dropout} javadoc
*/
@ -96,6 +110,18 @@ public class Dropout implements IDropout {
this(Double.NaN, activationRetainProbabilitySchedule);
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for Dropout will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public Dropout helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return this;
}
protected Dropout(@JsonProperty("p") double activationRetainProbability, @JsonProperty("pSchedule") ISchedule activationRetainProbabilitySchedule) {
this.p = activationRetainProbability;
this.pSchedule = activationRetainProbabilitySchedule;
@ -141,9 +167,29 @@ public class Dropout implements IDropout {
initializeHelper(output.dataType());
}
if(helper != null){
helper.applyDropout(inputActivations, output, p);
return output;
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
boolean helperWorked = false;
try {
helper.applyDropout(inputActivations, output, p);
helperWorked = true;
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback()){
helperCountFail++;
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during Dropout CuDNN helper forward pass - helperAllowFallback() is set to false", e);
}
}
if(helperWorked)
return output;
}
INDArray inputCast = inputActivations;
@ -159,9 +205,29 @@ public class Dropout implements IDropout {
@Override
public INDArray backprop(INDArray gradAtOutput, INDArray gradAtInput, int iteration, int epoch) {
if(helper != null){
helper.backprop(gradAtOutput, gradAtInput);
return gradAtInput;
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
boolean helperWorked = false;
try {
helper.backprop(gradAtOutput, gradAtInput);
helperWorked = true;
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback()){
helperCountFail++;
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during Dropout CuDNN helper backprop - helperAllowFallback() is set to false", e);
}
}
if(helperWorked)
return gradAtInput;
}
Preconditions.checkState(mask != null, "Cannot perform backprop: Dropout mask array is absent (already cleared?)");
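
The try/catch above is the same pattern this change applies at every helper call site (Dropout, ConvolutionLayer activation, LSTMHelpers). Restated as a standalone sketch with hypothetical names (HelperFallbackPolicy and tryHelper are illustrative only, not part of the patch):

import java.util.function.Supplier;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Hypothetical, simplified restatement of the per-instance fallback policy used in this patch. */
class HelperFallbackPolicy {
    private static final Logger log = LoggerFactory.getLogger(HelperFallbackPolicy.class);

    private final Object helper;            // CuDNN/MKL-DNN helper instance, or null if unavailable
    private final boolean helperAllowFallback;
    private int helperCountFail = 0;        // per layer/dropout instance; excluded from equals()/JSON

    HelperFallbackPolicy(Object helper, boolean helperAllowFallback) {
        this.helper = helper;
        this.helperAllowFallback = helperAllowFallback;
    }

    /** Runs a helper op; returns null when the caller should use the built-in implementation instead. */
    <T> T tryHelper(Supplier<T> helperOp) {
        // Skip the helper once it has failed - but only when fallback is allowed. With fallback
        // disabled it is always attempted, so the underlying error reaches the user.
        if (helper == null || (helperCountFail > 0 && helperAllowFallback)) {
            return null;
        }
        try {
            return helperOp.get();
        } catch (ND4JOpProfilerException e) {
            throw e;                                         // NaN panic etc. - always propagate
        } catch (RuntimeException e) {
            if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
                throw e;                                     // memory exception - never fall back
            }
            if (helperAllowFallback) {
                helperCountFail++;
                log.warn("Helper execution failed - falling back on built-in implementation", e);
                return null;                                 // built-in path runs instead
            }
            throw new RuntimeException("Helper failed and helperAllowFallback is set to false", e);
        }
    }
}

In short: once a helper fails and fallback is allowed, the helper is skipped for the remaining life of that layer instance via helperCountFail; ND4JOpProfilerException and allocation failures always propagate regardless of the setting.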

View File

@ -17,8 +17,6 @@
package org.deeplearning4j.nn.conf.layers;
import lombok.*;
import org.deeplearning4j.nn.params.LSTMParamInitializer;
import org.deeplearning4j.nn.weights.WeightInit;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.activations.IActivation;
import org.nd4j.linalg.activations.impl.ActivationSigmoid;
@ -35,11 +33,13 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
protected double forgetGateBiasInit;
protected IActivation gateActivationFn = new ActivationSigmoid();
protected boolean helperAllowFallback = true;
protected AbstractLSTM(Builder builder) {
super(builder);
this.forgetGateBiasInit = builder.forgetGateBiasInit;
this.gateActivationFn = builder.gateActivationFn;
this.helperAllowFallback = builder.helperAllowFallback;
}
@AllArgsConstructor
@ -60,6 +60,14 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
*/
protected IActivation gateActivationFn = new ActivationSigmoid();
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for LSTM/GravesLSTM will be used
*
*/
protected boolean helperAllowFallback = true;
/**
* Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
* dependencies.
@ -100,6 +108,18 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
return (T) this;
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for LSTM/GravesLSTM will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return (T) this;
}
}
}

View File

@ -428,16 +428,31 @@ public class BatchNormalization extends FeedForwardLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for BatchNormalization will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public Builder cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for BatchNormalizationLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return this;
}
/**
* How should the moving average of variance be stored? Two different parameterizations are supported.
* useLogStd(false): equivalent to 1.0.0-beta3 and earlier. The variance "parameter" is stored directly as

View File

@ -533,14 +533,29 @@ public class ConvolutionLayer extends FeedForwardLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
}
}
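
Migration is a one-word rename on the builder; a sketch (the kernel/nIn/nOut calls are pre-existing DL4J API, and the class name is illustrative - only the two fallback setters are relevant here):

import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;

public class FallbackMigration {
    public static void main(String[] args) {
        // Before: cudnnAllowFallback still compiles, but is now deprecated
        ConvolutionLayer oldStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .cudnnAllowFallback(true)
                .build();

        // After: helperAllowFallback covers both the CuDNN and MKL-DNN helpers
        ConvolutionLayer newStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .helperAllowFallback(true)
                .build();

        System.out.println(oldStyle + "\n" + newStyle);
    }
}

The same rename applies to the BatchNormalization builder above and to the LocalResponseNormalization, Subsampling3DLayer and SubsamplingLayer builders below.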

View File

@ -53,11 +53,13 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
private double forgetGateBiasInit;
private IActivation gateActivationFn = new ActivationSigmoid();
protected boolean helperAllowFallback = true;
private GravesBidirectionalLSTM(Builder builder) {
super(builder);
this.forgetGateBiasInit = builder.forgetGateBiasInit;
this.gateActivationFn = builder.gateActivationFn;
this.helperAllowFallback = builder.helperAllowFallback;
initializeConstraints(builder);
}
@ -123,6 +125,14 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
*/
private IActivation gateActivationFn = new ActivationSigmoid();
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for GravesBidirectionalLSTM will be used
*
*/
protected boolean helperAllowFallback = true;
/**
* Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
* dependencies.
@ -163,6 +173,18 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
return this;
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for GravesBidirectionalLSTM will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return (Builder) this;
}
@SuppressWarnings("unchecked")
public GravesBidirectionalLSTM build() {
return new GravesBidirectionalLSTM(this);

View File

@ -238,16 +238,31 @@ public class LocalResponseNormalization extends Layer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for BatchNormalization will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public Builder cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for LocalResponseNormalizationLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return this;
}
@Override
public LocalResponseNormalization build() {
return new LocalResponseNormalization(this);

View File

@ -455,15 +455,30 @@ public class Subsampling3DLayer extends NoParamLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for Subsampling3DLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
}
}

View File

@ -480,17 +480,32 @@ public class SubsamplingLayer extends NoParamLayer {
}
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for SubsamplingLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
/**
* When doing average pooling, should the padding values be included in the divisor or not?<br>
* Not applicable for max and p-norm pooling.<br>

View File

@ -378,7 +378,7 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
}
} else {
throw new RuntimeException(e);
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
}
}
if (ret != null) {
@ -453,8 +453,30 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
//String afn = conf.getLayer().getActivationFunction();
IActivation afn = layerConf().getActivationFn();
if (helper != null && Shape.strideDescendingCAscendingF(z)) {
INDArray ret = helper.activate(z, layerConf().getActivationFn(), training);
if (helper != null && Shape.strideDescendingCAscendingF(z) && (helperCountFail == 0 || !layerConf().isCudnnAllowFallback())) {
INDArray ret = null;
try {
ret = helper.activate(z, layerConf().getActivationFn(), training);
} catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e) {
if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if (layerConf().isCudnnAllowFallback()) {
helperCountFail++;
if (helper instanceof MKLDNNConvHelper) {
log.warn("MKL-DNN execution failed - falling back on built-in implementation", e);
} else {
log.warn("CuDNN execution failed - falling back on built-in implementation", e);
}
} else {
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}

View File

@ -22,7 +22,7 @@ import java.lang.reflect.Method;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Base class for MLK-DNN Helpers
* Base class for MKL-DNN Helpers
* @author Alex Black
*/
public class BaseMKLDNNHelper {

View File

@ -41,6 +41,8 @@ public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.n
*/
protected Map<String, INDArray> tBpttStateMap = new ConcurrentHashMap<>();
protected int helperCountFail = 0;
public BaseRecurrentLayer(NeuralNetConfiguration conf, DataType dataType) {
super(conf, dataType);
}

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -90,7 +89,8 @@ public class GravesBidirectionalLSTM
final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr);
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this,
this.conf,
this.layerConf().getGateActivationFn(), this.input,
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
@ -98,13 +98,14 @@ public class GravesBidirectionalLSTM
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray, true,
null, workspaceMgr);
null, workspaceMgr, layerConf().isHelperAllowFallback());
final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false, workspaceMgr);
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this,
this.conf,
this.layerConf().getGateActivationFn(), this.input,
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
@ -112,7 +113,7 @@ public class GravesBidirectionalLSTM
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true,
null, workspaceMgr);
null, workspaceMgr, layerConf().isHelperAllowFallback());
//merge the gradient, which is key value pair of String,INDArray
@ -175,7 +176,7 @@ public class GravesBidirectionalLSTM
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null,
forBackprop || (cacheMode != CacheMode.NONE && training), true,
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, maskArray, true, null,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
this.input,
@ -184,7 +185,7 @@ public class GravesBidirectionalLSTM
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null,
forBackprop || (cacheMode != CacheMode.NONE && training), false,
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
cachedPassForward = forwardsEval;
cachedPassBackward = backwardsEval;
@ -230,7 +231,7 @@ public class GravesBidirectionalLSTM
return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input,
getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training,
prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true,
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
}
}

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -92,11 +91,12 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
}
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this,
this.conf, this.layerConf().getGateActivationFn(), this.input,
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY,
GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null,
workspaceMgr);
workspaceMgr, layerConf().isHelperAllowFallback());
weightNoiseParams.clear();
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@ -141,7 +141,7 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
prevMemCellState, forBackprop || (cacheMode != CacheMode.NONE && training), true,
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, true, null,
cacheMode, workspaceMgr);
cacheMode, workspaceMgr, layerConf().isHelperAllowFallback());
if (training && cacheMode != CacheMode.NONE) {

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -32,8 +31,6 @@ import org.nd4j.linalg.primitives.Pair;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.nd4j.util.OneTimeLogger;
import java.util.Properties;
/**
* LSTM layer implementation.
*
@ -116,10 +113,12 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
}
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this,
this.conf, this.layerConf().getGateActivationFn(), this.input,
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY,
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr);
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr,
layerConf().isHelperAllowFallback());
weightNoiseParams.clear();
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@ -161,7 +160,7 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
prevMemCellState, (training && cacheMode != CacheMode.NONE) || forBackprop, true,
LSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, false, helper,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
if (training && cacheMode != CacheMode.NONE) {
cachedFwdPass = fwd;

View File

@ -29,6 +29,7 @@ import org.deeplearning4j.nn.conf.memory.MemoryReport;
import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.layers.BaseLayer;
import org.deeplearning4j.nn.layers.mkldnn.MKLDNNConvHelper;
import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.nd4j.linalg.activations.IActivation;
@ -38,6 +39,7 @@ import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
import org.nd4j.linalg.api.ops.impl.transforms.same.TimesOneMinus;
import org.nd4j.linalg.api.shape.Shape;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.primitives.Pair;
@ -81,7 +83,7 @@ public class LSTMHelpers {
* Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
* and rnnTimeStep whilst being reasonably efficient for all
*/
static public FwdPassReturn activateHelper(final BaseLayer layer, final NeuralNetConfiguration conf,
static public FwdPassReturn activateHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be found in range 0 to 1)
INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
@ -91,7 +93,7 @@ public class LSTMHelpers {
final String inputWeightKey, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
final LSTMHelper helper, final CacheMode cacheMode, // cacheMode for layer calling this helper
final LayerWorkspaceMgr workspaceMgr
final LayerWorkspaceMgr workspaceMgr, boolean isHelperAllowFallback
) {
//Mini-batch data format: for mini-batch size m, nIn inputs, and T time series length
@ -198,10 +200,28 @@ public class LSTMHelpers {
prevOutputActivations = Nd4j.zeros(input.dataType(), new long[] {miniBatchSize, hiddenLayerSize});
}
if (helper != null) {
FwdPassReturn ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
FwdPassReturn ret = null;
try {
ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback){
layer.helperCountFail++;
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during LSTM MKL/CuDNN helper forward pass - helperAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}
@ -424,7 +444,7 @@ public class LSTMHelpers {
}
}
static public Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
static public Pair<Gradient, INDArray> backpropGradientHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength,
@ -433,7 +453,8 @@ public class LSTMHelpers {
final Map<String, INDArray> gradientViews, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
final LSTMHelper helper,
final LayerWorkspaceMgr workspaceMgr) {
final LayerWorkspaceMgr workspaceMgr,
final boolean isHelperAllowFallback) {
input = input.castTo(inputWeights.dataType()); //No-op if
@ -496,11 +517,29 @@ public class LSTMHelpers {
rwGradientsGG = rwGradientsOut.get(all(), NDArrayIndex.point(4 * hiddenLayerSize + 2)).reshape(1, recurrentWeights.size(0));
}
if (helper != null) {
Pair<Gradient, INDArray> ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
hasPeepholeConnections, workspaceMgr);
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
Pair<Gradient, INDArray> ret = null;
try {
ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
hasPeepholeConnections, workspaceMgr);
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback){
layer.helperCountFail++;
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during LSTM MKL/CuDNN helper backprop - helperAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}

View File

@ -45,7 +45,7 @@ Ideally, these should be excluded from any timing/performance results you report
For example: what BLAS implementation (MKL, OpenBLAS, etc)? If you are using CUDA, are you using CuDNN?
ND4J and DL4J can use these libraries (MKL, CuDNN) when they are available - but are not always available by default. If they are not made available, performance can be lower - sometimes considerably.
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MLK) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MKL) your results may simply reflect the performance differences in the BLAS library being used - and not the performance of the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
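
For example, the following sketch prints which ND4J backend is active along with its environment details (assuming the standard Nd4j.getBackend() and OpExecutioner.printEnvironmentInformation() utilities; the class name is illustrative):

import org.nd4j.linalg.factory.Nd4j;

public class BackendInfo {
    public static void main(String[] args) {
        // Which ND4J backend is on the classpath: nd4j-native (CPU with MKL or OpenBLAS) vs nd4j-cuda
        System.out.println("ND4J backend: " + Nd4j.getBackend().getClass().getName());
        // Logs BLAS vendor, device and memory details for the active backend
        Nd4j.getExecutioner().printEnvironmentInformation();
    }
}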
3. How are things configured?