From f40bdcf88511818a36d8c876c1e6f88e83e7a6cb Mon Sep 17 00:00:00 2001 From: Ryan Nett Date: Wed, 28 Aug 2019 20:05:01 -0700 Subject: [PATCH] Dl4j LSTM and Dropout CuDNN fallback and options (#152) * add fallback for Conv layer activation Signed-off-by: Ryan Nett * add fallback and config option for LSTM layers Signed-off-by: Ryan Nett * add fallback option and setting for dropout Signed-off-by: Ryan Nett * fix comments and error messages Signed-off-by: Ryan Nett * move helper fail count to layer instance Signed-off-by: Ryan Nett * ignore helperCountFail for equals and json Signed-off-by: Ryan Nett * typo fix (MLK -> MKL) Signed-off-by: Ryan Nett * add MKLDNN to error messages Signed-off-by: Ryan Nett * add helperAllowFallback to builders, deprecate cudnnAllowFallback Signed-off-by: Ryan Nett --- .../GravesBidirectionalLSTMTest.java | 4 +- .../nn/conf/dropout/Dropout.java | 84 +++++++++++++++++-- .../nn/conf/layers/AbstractLSTM.java | 24 +++++- .../nn/conf/layers/BatchNormalization.java | 17 +++- .../nn/conf/layers/ConvolutionLayer.java | 17 +++- .../conf/layers/GravesBidirectionalLSTM.java | 22 +++++ .../layers/LocalResponseNormalization.java | 17 +++- .../nn/conf/layers/Subsampling3DLayer.java | 17 +++- .../nn/conf/layers/SubsamplingLayer.java | 21 ++++- .../layers/convolution/ConvolutionLayer.java | 28 ++++++- .../nn/layers/mkldnn/BaseMKLDNNHelper.java | 2 +- .../layers/recurrent/BaseRecurrentLayer.java | 2 + .../recurrent/GravesBidirectionalLSTM.java | 17 ++-- .../nn/layers/recurrent/GravesLSTM.java | 8 +- .../nn/layers/recurrent/LSTM.java | 11 ++- .../nn/layers/recurrent/LSTMHelpers.java | 65 +++++++++++--- docs/deeplearning4j/templates/benchmark.md | 2 +- 17 files changed, 302 insertions(+), 56 deletions(-) diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java index 297067862..751b6f6bf 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTMTest.java @@ -186,7 +186,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest { lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, - null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutput; + null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutput; final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), @@ -194,7 +194,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest { lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, null, - CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutputAsArrays; + CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutputAsArrays; //I have no idea what the heck this does --Ben for (int i = 0; i < timeSeriesLength; i++) { diff --git 
a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/dropout/Dropout.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/dropout/Dropout.java index f9af153ad..ee8bbdc64 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/dropout/Dropout.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/dropout/Dropout.java @@ -18,6 +18,8 @@ package org.deeplearning4j.nn.conf.dropout; import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.Setter; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.workspace.ArrayType; import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; @@ -26,11 +28,11 @@ import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp; import org.nd4j.linalg.api.ops.random.impl.DropOutInverted; +import org.nd4j.linalg.exception.ND4JOpProfilerException; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.schedule.ISchedule; import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties; import org.nd4j.shade.jackson.annotation.JsonProperty; -import org.nd4j.util.OneTimeLogger; /** * Implements standard (inverted) dropout.
@@ -64,17 +66,29 @@ import org.nd4j.util.OneTimeLogger; * @author Alex Black */ @Data -@JsonIgnoreProperties({"mask", "helper"}) -@EqualsAndHashCode(exclude = {"mask", "helper"}) +@JsonIgnoreProperties({"mask", "helper", "helperCountFail"}) +@EqualsAndHashCode(exclude = {"mask", "helper", "helperCountFail"}) @Slf4j public class Dropout implements IDropout { + /** + * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed? + * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in + * (non-CuDNN) implementation for Dropout will be used + * + */ + @Getter + @Setter + protected boolean helperAllowFallback = true; + private double p; private ISchedule pSchedule; private transient INDArray mask; private transient DropoutHelper helper; private boolean initializedHelper = false; + private int helperCountFail = 0; + /** * @param activationRetainProbability Probability of retaining an activation - see {@link Dropout} javadoc */ @@ -96,6 +110,18 @@ public class Dropout implements IDropout { this(Double.NaN, activationRetainProbabilitySchedule); } + /** + * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-helper) implementation for Dropout will be used + * + * @param allowFallback Whether fallback to non-helper implementation should be used + */ + public Dropout helperAllowFallback(boolean allowFallback) { + this.setHelperAllowFallback(allowFallback); + return this; + } + protected Dropout(@JsonProperty("p") double activationRetainProbability, @JsonProperty("pSchedule") ISchedule activationRetainProbabilitySchedule) { this.p = activationRetainProbability; this.pSchedule = activationRetainProbabilitySchedule; @@ -141,9 +167,29 @@ public class Dropout implements IDropout { initializeHelper(output.dataType()); } - if(helper != null){ - helper.applyDropout(inputActivations, output, p); - return output; + if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){ + boolean helperWorked = false; + try { + helper.applyDropout(inputActivations, output, p); + helperWorked = true; + }catch (ND4JOpProfilerException e){ + throw e; //NaN panic etc for debugging + } catch (Exception e){ + if(e.getMessage().contains("Failed to allocate")){ + //This is a memory exception - don't fallback to built-in implementation + throw e; + } + + if(isHelperAllowFallback()){ + helperCountFail++; + log.warn("CuDNN execution failed - falling back on built-in implementation",e); + } else { + throw new RuntimeException("Error during Dropout CuDNN helper forward pass - helperAllowFallback() is set to false", e); + } + } + + if(helperWorked) + return output; } INDArray inputCast = inputActivations; @@ -159,9 +205,29 @@ public class Dropout implements IDropout { @Override public INDArray backprop(INDArray gradAtOutput, INDArray gradAtInput, int iteration, int epoch) { - if(helper != null){ - helper.backprop(gradAtOutput, gradAtInput); - return gradAtInput; + if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){ + boolean helperWorked = false; + try { + helper.backprop(gradAtOutput, gradAtInput); + helperWorked = true; + }catch (ND4JOpProfilerException e){ + throw e; //NaN panic etc for debugging + } catch (Exception e){ + if(e.getMessage().contains("Failed to allocate")){ + //This is a memory
exception - don't fallback to built-in implementation + throw e; + } + + if(isHelperAllowFallback()){ + helperCountFail++; + log.warn("CuDNN execution failed - falling back on built-in implementation",e); + } else { + throw new RuntimeException("Error during Dropout CuDNN helper backprop - helperAllowFallback() is set to false", e); + } + } + + if(helperWorked) + return gradAtInput; } Preconditions.checkState(mask != null, "Cannot perform backprop: Dropout mask array is absent (already cleared?)"); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/AbstractLSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/AbstractLSTM.java index 82bda5647..b051c4b36 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/AbstractLSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/AbstractLSTM.java @@ -17,8 +17,6 @@ package org.deeplearning4j.nn.conf.layers; import lombok.*; -import org.deeplearning4j.nn.params.LSTMParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; import org.nd4j.linalg.activations.Activation; import org.nd4j.linalg.activations.IActivation; import org.nd4j.linalg.activations.impl.ActivationSigmoid; @@ -35,11 +33,13 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer { protected double forgetGateBiasInit; protected IActivation gateActivationFn = new ActivationSigmoid(); + protected boolean helperAllowFallback = true; protected AbstractLSTM(Builder builder) { super(builder); this.forgetGateBiasInit = builder.forgetGateBiasInit; this.gateActivationFn = builder.gateActivationFn; + this.helperAllowFallback = builder.helperAllowFallback; } @AllArgsConstructor @@ -60,6 +60,14 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer { */ protected IActivation gateActivationFn = new ActivationSigmoid(); + /** + * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed? + * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in + * (non-CuDNN) implementation for LSTM/GravesLSTM will be used + * + */ + protected boolean helperAllowFallback = true; + /** * Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term * dependencies. @@ -100,6 +108,18 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer { return (T) this; } + /** + * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. 
If true, the built-in + * (non-helper) implementation for LSTM/GravesLSTM will be used + * + * @param allowFallback Whether fallback to non-helper implementation should be used + */ + public T helperAllowFallback(boolean allowFallback) { + this.setHelperAllowFallback(allowFallback); + return (T) this; + } + } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java index 53c00acac..4c470fec5 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java @@ -428,16 +428,31 @@ public class BatchNormalization extends FeedForwardLayer { /** * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed? - * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in + * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in * (non-CuDNN) implementation for BatchNormalization will be used * + * @deprecated Use {@link #helperAllowFallback(boolean)} + * * @param allowFallback Whether fallback to non-CuDNN implementation should be used */ + @Deprecated public Builder cudnnAllowFallback(boolean allowFallback) { this.setCudnnAllowFallback(allowFallback); return this; } + /** + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for BatchNormalizationLayer will be used + * + * @param allowFallback Whether fallback to non-CuDNN implementation should be used + */ + public Builder helperAllowFallback(boolean allowFallback) { + this.cudnnAllowFallback = allowFallback; + return this; + } + /** * How should the moving average of variance be stored? Two different parameterizations are supported. * useLogStd(false): equivalent to 1.0.0-beta3 and earlier. The variance "parameter" is stored directly as diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ConvolutionLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ConvolutionLayer.java index 3d2e35d24..4fdf1e9cc 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ConvolutionLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/ConvolutionLayer.java @@ -533,14 +533,29 @@ public class ConvolutionLayer extends FeedForwardLayer { /** * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed? - * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in + * If set to false, an exception in CuDNN will be propagated back to the user. 
If true, the built-in * (non-CuDNN) implementation for ConvolutionLayer will be used * + * @deprecated Use {@link #helperAllowFallback(boolean)} + * * @param allowFallback Whether fallback to non-CuDNN implementation should be used */ + @Deprecated public T cudnnAllowFallback(boolean allowFallback) { this.setCudnnAllowFallback(allowFallback); return (T) this; } + + /** + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for ConvolutionLayer will be used + * + * @param allowFallback Whether fallback to non-CuDNN implementation should be used + */ + public T helperAllowFallback(boolean allowFallback) { + this.cudnnAllowFallback = allowFallback; + return (T) this; + } } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.java index d7aa869a1..1a2a89a24 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.java @@ -53,11 +53,13 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer { private double forgetGateBiasInit; private IActivation gateActivationFn = new ActivationSigmoid(); + protected boolean helperAllowFallback = true; private GravesBidirectionalLSTM(Builder builder) { super(builder); this.forgetGateBiasInit = builder.forgetGateBiasInit; this.gateActivationFn = builder.gateActivationFn; + this.helperAllowFallback = builder.helperAllowFallback; initializeConstraints(builder); } @@ -123,6 +125,14 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer { */ private IActivation gateActivationFn = new ActivationSigmoid(); + /** + * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed? + * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in + * (non-CuDNN) implementation for GravesBidirectionalLSTM will be used + * + */ + protected boolean helperAllowFallback = true; + /** * Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term * dependencies. @@ -163,6 +173,18 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer { return this; } + /** + * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. 
If true, the built-in + * (non-helper) implementation for GravesBidirectionalLSTM will be used + * + * @param allowFallback Whether fallback to non-helper implementation should be used + */ + public Builder helperAllowFallback(boolean allowFallback) { + this.setHelperAllowFallback(allowFallback); + return (Builder) this; + } + @SuppressWarnings("unchecked") public GravesBidirectionalLSTM build() { return new GravesBidirectionalLSTM(this); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java index dfc2df9c8..b16703569 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LocalResponseNormalization.java @@ -238,16 +238,31 @@ public class LocalResponseNormalization extends Layer { /** * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed? - * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in + * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in * (non-CuDNN) implementation for BatchNormalization will be used * + * @deprecated Use {@link #helperAllowFallback(boolean)} + * * @param allowFallback Whether fallback to non-CuDNN implementation should be used */ + @Deprecated public Builder cudnnAllowFallback(boolean allowFallback) { this.setCudnnAllowFallback(allowFallback); return this; } + /** + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for LocalResponseNormalizationLayer will be used + * + * @param allowFallback Whether fallback to non-CuDNN implementation should be used + */ + public Builder helperAllowFallback(boolean allowFallback) { + this.cudnnAllowFallback = allowFallback; + return this; + } + @Override public LocalResponseNormalization build() { return new LocalResponseNormalization(this); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling3DLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling3DLayer.java index 877e216da..0d0ccba9b 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling3DLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/Subsampling3DLayer.java @@ -455,15 +455,30 @@ public class Subsampling3DLayer extends NoParamLayer { /** * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed? - * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in + * If set to false, an exception in CuDNN will be propagated back to the user. 
If true, the built-in * (non-CuDNN) implementation for ConvolutionLayer will be used * + * @deprecated Use {@link #helperAllowFallback(boolean)} + * * @param allowFallback Whether fallback to non-CuDNN implementation should be used */ + @Deprecated public T cudnnAllowFallback(boolean allowFallback) { this.setCudnnAllowFallback(allowFallback); return (T) this; } + + /** + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for Subsampling3DLayer will be used + * + * @param allowFallback Whether fallback to non-CuDNN implementation should be used + */ + public T helperAllowFallback(boolean allowFallback) { + this.cudnnAllowFallback = allowFallback; + return (T) this; + } } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java index b2e4df6b8..c20526cf1 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java @@ -480,17 +480,32 @@ public class SubsamplingLayer extends NoParamLayer { } /** - * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed? - * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in - * (non-CuDNN) implementation for ConvolutionLayer will be used + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for ConvolutionLayer will be used + * + * @deprecated Use {@link #helperAllowFallback(boolean)} * * @param allowFallback Whether fallback to non-CuDNN implementation should be used */ + @Deprecated public T cudnnAllowFallback(boolean allowFallback) { this.cudnnAllowFallback = allowFallback; return (T) this; } + /** + * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed? + * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in + * (non-MKL/CuDNN) implementation for SubsamplingLayer will be used + * + * @param allowFallback Whether fallback to non-CuDNN implementation should be used + */ + public T helperAllowFallback(boolean allowFallback) { + this.cudnnAllowFallback = allowFallback; + return (T) this; + } + /** * When doing average pooling, should the padding values be included in the divisor or not?
* Not applicable for max and p-norm pooling.
diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java index d6cab0273..75e265b4e 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java @@ -378,7 +378,7 @@ public class ConvolutionLayer extends BaseLayer tBpttStateMap = new ConcurrentHashMap<>(); + protected int helperCountFail = 0; + public BaseRecurrentLayer(NeuralNetConfiguration conf, DataType dataType) { super(conf, dataType); } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java index 78e15e167..6fc96dc80 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesBidirectionalLSTM.java @@ -17,7 +17,6 @@ package org.deeplearning4j.nn.layers.recurrent; import lombok.extern.slf4j.Slf4j; -import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.api.MaskState; import org.deeplearning4j.nn.conf.CacheMode; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; @@ -90,7 +89,8 @@ public class GravesBidirectionalLSTM final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr); - final Pair forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf, + final Pair forwardsGradient = LSTMHelpers.backpropGradientHelper(this, + this.conf, this.layerConf().getGateActivationFn(), this.input, getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon, @@ -98,13 +98,14 @@ public class GravesBidirectionalLSTM GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS, GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray, true, - null, workspaceMgr); + null, workspaceMgr, layerConf().isHelperAllowFallback()); final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false, workspaceMgr); - final Pair backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf, + final Pair backwardsGradient = LSTMHelpers.backpropGradientHelper(this, + this.conf, this.layerConf().getGateActivationFn(), this.input, getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS), getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon, @@ -112,7 +113,7 @@ public class GravesBidirectionalLSTM GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS, GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true, - null, workspaceMgr); + null, workspaceMgr, layerConf().isHelperAllowFallback()); //merge the gradient, which is key value pair of String,INDArray @@ -175,7 +176,7 @@ public class GravesBidirectionalLSTM getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null, 
forBackprop || (cacheMode != CacheMode.NONE && training), true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, maskArray, true, null, - forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr); + forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback()); backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input, @@ -184,7 +185,7 @@ public class GravesBidirectionalLSTM getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null, forBackprop || (cacheMode != CacheMode.NONE && training), false, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null, - forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr); + forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback()); cachedPassForward = forwardsEval; cachedPassBackward = backwardsEval; @@ -230,7 +231,7 @@ public class GravesBidirectionalLSTM return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input, getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training, prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true, - null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr); + null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback()); } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java index a2f38b324..13f30b8bb 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java @@ -17,7 +17,6 @@ package org.deeplearning4j.nn.layers.recurrent; import lombok.extern.slf4j.Slf4j; -import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.api.MaskState; import org.deeplearning4j.nn.conf.CacheMode; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; @@ -92,11 +91,12 @@ public class GravesLSTM extends BaseRecurrentLayer p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input, + Pair p = LSTMHelpers.backpropGradientHelper(this, + this.conf, this.layerConf().getGateActivationFn(), this.input, recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true, GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY, GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null, - workspaceMgr); + workspaceMgr, layerConf().isHelperAllowFallback()); weightNoiseParams.clear(); p.setSecond(backpropDropOutIfPresent(p.getSecond())); @@ -141,7 +141,7 @@ public class GravesLSTM extends BaseRecurrentLayer p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input, + Pair p = LSTMHelpers.backpropGradientHelper(this, + this.conf, this.layerConf().getGateActivationFn(), this.input, recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true, LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY, - LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr); + LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr, + 
layerConf().isHelperAllowFallback()); weightNoiseParams.clear(); p.setSecond(backpropDropOutIfPresent(p.getSecond())); @@ -161,7 +160,7 @@ public class LSTM extends BaseRecurrentLayer backpropGradientHelper(final NeuralNetConfiguration conf, + static public Pair backpropGradientHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf, final IActivation gateActivationFn, INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG] final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg] final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength, @@ -433,7 +453,8 @@ public class LSTMHelpers { final Map gradientViews, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM final LSTMHelper helper, - final LayerWorkspaceMgr workspaceMgr) { + final LayerWorkspaceMgr workspaceMgr, + final boolean isHelperAllowFallback) { input = input.castTo(inputWeights.dataType()); //No-op if @@ -496,11 +517,29 @@ public class LSTMHelpers { rwGradientsGG = rwGradientsOut.get(all(), NDArrayIndex.point(4 * hiddenLayerSize + 2)).reshape(1, recurrentWeights.size(0)); } - if (helper != null) { - Pair ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights, - inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards, - inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray, - hasPeepholeConnections, workspaceMgr); + if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) { + Pair ret = null; + try { + ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights, + inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards, + inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray, + hasPeepholeConnections, workspaceMgr); + }catch (ND4JOpProfilerException e){ + throw e; //NaN panic etc for debugging + } catch (Exception e){ + if(e.getMessage().contains("Failed to allocate")){ + //This is a memory exception - don't fallback to built-in implementation + throw e; + } + + if(isHelperAllowFallback){ + layer.helperCountFail++; + log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e); + } else { + throw new RuntimeException("Error during LSTM MKL/CuDNN helper backprop - helperAllowFallback() is set to false", e); + } + } + if (ret != null) { return ret; } diff --git a/docs/deeplearning4j/templates/benchmark.md b/docs/deeplearning4j/templates/benchmark.md index 330ff99a6..93e30fda9 100644 --- a/docs/deeplearning4j/templates/benchmark.md +++ b/docs/deeplearning4j/templates/benchmark.md @@ -45,7 +45,7 @@ Ideally, these should be excluded from any timing/performance results you report For example: what BLAS implementation (MKL, OpenBLAS, etc)? If you are using CUDA, are you using CuDNN? ND4J and DL4J can use these libraries (MKL, CuDNN) when they are available - but are not always available by default. If they are not made available, performance can be lower - sometimes considerably. -This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MLK) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. 
Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN. +This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MKL) your results may simply reflect the performance differences in the BLAS library being used - and not the performance of the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN. 3. How are things configured?
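
For illustration, a minimal sketch of how the helperAllowFallback option added in this patch might be wired into a network configuration. The network shape, the sizes and the names HelperFallbackExample, nIn, nHidden and nOut are assumptions for the example; only the helperAllowFallback(boolean) builder method (inherited by LSTM.Builder via AbstractLSTM.Builder) and the fluent helperAllowFallback(boolean) setter on Dropout come from this patch.

import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.LSTM;
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.lossfunctions.LossFunctions;

public class HelperFallbackExample {
    public static void main(String[] args) {
        // Illustrative sizes - not taken from the patch
        int nIn = 10;
        int nHidden = 64;
        int nOut = 3;

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .list()
                // helperAllowFallback(false): if the CuDNN/MKLDNN helper fails, the exception
                // is rethrown instead of silently falling back to the built-in implementation
                // (the default is true, i.e. fall back and log a warning)
                .layer(new LSTM.Builder()
                        .nIn(nIn).nOut(nHidden)
                        .activation(Activation.TANH)
                        .helperAllowFallback(false)
                        // Dropout exposes the same option as a fluent setter
                        .dropOut(new Dropout(0.5).helperAllowFallback(false))
                        .build())
                .layer(new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .nIn(nHidden).nOut(nOut)
                        .activation(Activation.SOFTMAX)
                        .build())
                .build();
    }
}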
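
A similar sketch for the convolution-style builders, showing the deprecation introduced above: cudnnAllowFallback(boolean) still compiles but is now @Deprecated, and helperAllowFallback(boolean) is the replacement that also covers the MKLDNN helpers. Kernel sizes, channel counts and the class name FallbackMigrationExample are illustrative assumptions, not part of the patch.

import org.deeplearning4j.nn.conf.layers.BatchNormalization;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;

public class FallbackMigrationExample {
    public static void main(String[] args) {
        // Before this patch (still compiles, now deprecated):
        ConvolutionLayer oldStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .cudnnAllowFallback(true)
                .build();

        // After this patch - covers both the CuDNN and MKLDNN helpers:
        ConvolutionLayer newStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .helperAllowFallback(true)
                .build();

        SubsamplingLayer pool = new SubsamplingLayer.Builder(2, 2)
                .helperAllowFallback(false)   // propagate helper errors instead of falling back
                .build();

        BatchNormalization bn = new BatchNormalization.Builder()
                .helperAllowFallback(true)
                .build();
    }
}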
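
Finally, the control flow this patch repeats in Dropout, the LSTM helpers and the convolution/subsampling layers can be distilled as the standalone sketch below. The class HelperFallbackPattern and its fields are invented for illustration and do not exist in the repository; the point is the guard: once the helper has failed and fallback is allowed, helperCountFail is non-zero and the helper is skipped on later calls, memory-allocation failures are always rethrown, and with fallback disabled any helper error is propagated to the caller.

public class HelperFallbackPattern {
    private int helperCountFail = 0;          // per-layer-instance failure counter (excluded from equals/JSON in the patch)
    private final boolean helperAllowFallback;
    private final Runnable helperImpl;        // stands in for the CuDNN/MKLDNN code path
    private final Runnable builtInImpl;       // stands in for the built-in implementation

    public HelperFallbackPattern(boolean helperAllowFallback, Runnable helperImpl, Runnable builtInImpl) {
        this.helperAllowFallback = helperAllowFallback;
        this.helperImpl = helperImpl;
        this.builtInImpl = builtInImpl;
    }

    public void run() {
        // The helper is only tried if it has not failed before, or if fallback is disabled
        if (helperCountFail == 0 || !helperAllowFallback) {
            try {
                helperImpl.run();
                return;                        // helper succeeded - done
            } catch (RuntimeException e) {
                if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
                    throw e;                   // memory errors are never swallowed
                }
                if (helperAllowFallback) {
                    helperCountFail++;         // remember the failure, fall through to the built-in path
                } else {
                    throw new RuntimeException("Helper failed and helperAllowFallback is false", e);
                }
            }
        }
        builtInImpl.run();                     // built-in (non-CuDNN/MKLDNN) implementation
    }
}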