DL4J LSTM and Dropout CuDNN fallback and options (#152)

* add fallback for Conv layer activation

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add fallback and config option for LSTM layers

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add fallback option and setting for dropout

Signed-off-by: Ryan Nett <rnett@skymind.io>

* fix comments and error messages

Signed-off-by: Ryan Nett <rnett@skymind.io>

* move helper fail count to layer instance

Signed-off-by: Ryan Nett <rnett@skymind.io>

* ignore helperCountFail for equals and json

Signed-off-by: Ryan Nett <rnett@skymind.io>

* typo fix (MLK -> MKL)

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add MKLDNN to error messages

Signed-off-by: Ryan Nett <rnett@skymind.io>

* add helperAllowFallback to builders, deprecate cudnnAllowFallback

Signed-off-by: Ryan Nett <rnett@skymind.io>
master
Ryan Nett 2019-08-28 20:05:01 -07:00 committed by Alex Black
parent 70af8c2afc
commit f40bdcf885
17 changed files with 302 additions and 56 deletions
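
As a usage illustration (not part of the diff): the new option is set on the layer builders and on the Dropout configuration roughly as follows. Apart from the helperAllowFallback(...) calls added in this change, everything here is pre-existing DL4J API; the class name is illustrative only.

import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.LSTM;
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.lossfunctions.LossFunctions;

public class HelperFallbackConfigExample {
    public static void main(String[] args) {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .list()
                .layer(0, new LSTM.Builder()
                        .nIn(10).nOut(20)
                        // New in this change: propagate CuDNN/MKL-DNN helper errors
                        // instead of silently falling back to the built-in implementation
                        .helperAllowFallback(false)
                        // Dropout fallback is configured on the IDropout instance itself
                        .dropOut(new Dropout(0.5).helperAllowFallback(false))
                        .build())
                .layer(1, new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .activation(Activation.SOFTMAX)
                        .nIn(20).nOut(5)
                        .build())
                .build();
        System.out.println(conf.toJson());   // helperCountFail is excluded from JSON, per this commit
    }
}

The default remains helperAllowFallback(true), which matches the previous cudnnAllowFallback behaviour of logging a warning and continuing with the built-in implementation.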

View File

@ -186,7 +186,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true,
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutput;
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutput;
final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(),
lstm.input(),
@ -194,7 +194,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, null,
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutputAsArrays;
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutputAsArrays;
//I have no idea what the heck this does --Ben
for (int i = 0; i < timeSeriesLength; i++) {

View File

@ -18,6 +18,8 @@ package org.deeplearning4j.nn.conf.dropout;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
@ -26,11 +28,11 @@ import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
import org.nd4j.linalg.api.ops.random.impl.DropOutInverted;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.schedule.ISchedule;
import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties;
import org.nd4j.shade.jackson.annotation.JsonProperty;
import org.nd4j.util.OneTimeLogger;
/**
* Implements standard (inverted) dropout.<br>
@ -64,17 +66,29 @@ import org.nd4j.util.OneTimeLogger;
* @author Alex Black
*/
@Data
@JsonIgnoreProperties({"mask", "helper"})
@EqualsAndHashCode(exclude = {"mask", "helper"})
@JsonIgnoreProperties({"mask", "helper", "helperCountFail"})
@EqualsAndHashCode(exclude = {"mask", "helper", "helperCountFail"})
@Slf4j
public class Dropout implements IDropout {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for Dropout will be used
*
*/
@Getter
@Setter
protected boolean helperAllowFallback = true;
private double p;
private ISchedule pSchedule;
private transient INDArray mask;
private transient DropoutHelper helper;
private boolean initializedHelper = false;
private int helperCountFail = 0;
/**
* @param activationRetainProbability Probability of retaining an activation - see {@link Dropout} javadoc
*/
@ -96,6 +110,18 @@ public class Dropout implements IDropout {
this(Double.NaN, activationRetainProbabilitySchedule);
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for Dropout will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public Dropout helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return this;
}
protected Dropout(@JsonProperty("p") double activationRetainProbability, @JsonProperty("pSchedule") ISchedule activationRetainProbabilitySchedule) {
this.p = activationRetainProbability;
this.pSchedule = activationRetainProbabilitySchedule;
@ -141,9 +167,29 @@ public class Dropout implements IDropout {
initializeHelper(output.dataType());
}
if(helper != null){
helper.applyDropout(inputActivations, output, p);
return output;
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
boolean helperWorked = false;
try {
helper.applyDropout(inputActivations, output, p);
helperWorked = true;
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback()){
helperCountFail++;
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during Dropout CuDNN helper forward pass - helperAllowFallback() is set to false", e);
}
}
if(helperWorked)
return output;
}
INDArray inputCast = inputActivations;
@ -159,9 +205,29 @@ public class Dropout implements IDropout {
@Override
public INDArray backprop(INDArray gradAtOutput, INDArray gradAtInput, int iteration, int epoch) {
if(helper != null){
helper.backprop(gradAtOutput, gradAtInput);
return gradAtInput;
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
boolean helperWorked = false;
try {
helper.backprop(gradAtOutput, gradAtInput);
helperWorked = true;
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback()){
helperCountFail++;
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during Dropout CuDNN helper backprop - helperAllowFallback() is set to false", e);
}
}
if(helperWorked)
return gradAtInput;
}
Preconditions.checkState(mask != null, "Cannot perform backprop: Dropout mask array is absent (already cleared?)");
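
The try/catch above is the same pattern this change applies at every helper call site (Dropout, ConvolutionLayer activation, LSTMHelpers). Restated as a standalone sketch with hypothetical names (HelperFallbackPolicy and tryHelper are illustrative only, not part of the patch):

import java.util.function.Supplier;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Hypothetical, simplified restatement of the per-instance fallback policy used in this patch. */
class HelperFallbackPolicy {
    private static final Logger log = LoggerFactory.getLogger(HelperFallbackPolicy.class);

    private final Object helper;            // CuDNN/MKL-DNN helper instance, or null if unavailable
    private final boolean helperAllowFallback;
    private int helperCountFail = 0;        // per layer/dropout instance; excluded from equals()/JSON

    HelperFallbackPolicy(Object helper, boolean helperAllowFallback) {
        this.helper = helper;
        this.helperAllowFallback = helperAllowFallback;
    }

    /** Runs a helper op; returns null when the caller should use the built-in implementation instead. */
    <T> T tryHelper(Supplier<T> helperOp) {
        // Skip the helper once it has failed - but only when fallback is allowed. With fallback
        // disabled it is always attempted, so the underlying error reaches the user.
        if (helper == null || (helperCountFail > 0 && helperAllowFallback)) {
            return null;
        }
        try {
            return helperOp.get();
        } catch (ND4JOpProfilerException e) {
            throw e;                                         // NaN panic etc. - always propagate
        } catch (RuntimeException e) {
            if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
                throw e;                                     // memory exception - never fall back
            }
            if (helperAllowFallback) {
                helperCountFail++;
                log.warn("Helper execution failed - falling back on built-in implementation", e);
                return null;                                 // built-in path runs instead
            }
            throw new RuntimeException("Helper failed and helperAllowFallback is set to false", e);
        }
    }
}

In short: once a helper fails and fallback is allowed, the helper is skipped for the remaining life of that layer instance via helperCountFail; ND4JOpProfilerException and allocation failures always propagate regardless of the setting.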

View File

@ -17,8 +17,6 @@
package org.deeplearning4j.nn.conf.layers;
import lombok.*;
import org.deeplearning4j.nn.params.LSTMParamInitializer;
import org.deeplearning4j.nn.weights.WeightInit;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.activations.IActivation;
import org.nd4j.linalg.activations.impl.ActivationSigmoid;
@ -35,11 +33,13 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
protected double forgetGateBiasInit;
protected IActivation gateActivationFn = new ActivationSigmoid();
protected boolean helperAllowFallback = true;
protected AbstractLSTM(Builder builder) {
super(builder);
this.forgetGateBiasInit = builder.forgetGateBiasInit;
this.gateActivationFn = builder.gateActivationFn;
this.helperAllowFallback = builder.helperAllowFallback;
}
@AllArgsConstructor
@ -60,6 +60,14 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
*/
protected IActivation gateActivationFn = new ActivationSigmoid();
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for LSTM/GravesLSTM will be used
*
*/
protected boolean helperAllowFallback = true;
/**
* Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
* dependencies.
@ -100,6 +108,18 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
return (T) this;
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for LSTM/GravesLSTM will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return (T) this;
}
}
}

View File

@ -428,16 +428,31 @@ public class BatchNormalization extends FeedForwardLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for BatchNormalization will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public Builder cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for BatchNormalizationLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return this;
}
/**
* How should the moving average of variance be stored? Two different parameterizations are supported.
* useLogStd(false): equivalent to 1.0.0-beta3 and earlier. The variance "parameter" is stored directly as

View File

@ -533,14 +533,29 @@ public class ConvolutionLayer extends FeedForwardLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
}
}
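
Migration is a one-word rename on the builder; a sketch (the kernel/nIn/nOut calls are pre-existing DL4J API, and the class name is illustrative - only the two fallback setters are relevant here):

import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;

public class FallbackMigration {
    public static void main(String[] args) {
        // Before: cudnnAllowFallback still compiles, but is now deprecated
        ConvolutionLayer oldStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .cudnnAllowFallback(true)
                .build();

        // After: helperAllowFallback covers both the CuDNN and MKL-DNN helpers
        ConvolutionLayer newStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .helperAllowFallback(true)
                .build();

        System.out.println(oldStyle + "\n" + newStyle);
    }
}

The same rename applies to the BatchNormalization builder above and to the LocalResponseNormalization, Subsampling3DLayer and SubsamplingLayer builders below.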

View File

@ -53,11 +53,13 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
private double forgetGateBiasInit;
private IActivation gateActivationFn = new ActivationSigmoid();
protected boolean helperAllowFallback = true;
private GravesBidirectionalLSTM(Builder builder) {
super(builder);
this.forgetGateBiasInit = builder.forgetGateBiasInit;
this.gateActivationFn = builder.gateActivationFn;
this.helperAllowFallback = builder.helperAllowFallback;
initializeConstraints(builder);
}
@ -123,6 +125,14 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
*/
private IActivation gateActivationFn = new ActivationSigmoid();
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for GravesBidirectionalLSTM will be used
*
*/
protected boolean helperAllowFallback = true;
/**
* Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
* dependencies.
@ -163,6 +173,18 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
return this;
}
/**
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-helper) implementation for GravesBidirectionalLSTM will be used
*
* @param allowFallback Whether fallback to non-helper implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.setHelperAllowFallback(allowFallback);
return (Builder) this;
}
@SuppressWarnings("unchecked")
public GravesBidirectionalLSTM build() {
return new GravesBidirectionalLSTM(this);

View File

@ -238,16 +238,31 @@ public class LocalResponseNormalization extends Layer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for BatchNormalization will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public Builder cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for LocalResponseNormalizationLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public Builder helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return this;
}
@Override
public LocalResponseNormalization build() {
return new LocalResponseNormalization(this);

View File

@ -455,15 +455,30 @@ public class Subsampling3DLayer extends NoParamLayer {
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.setCudnnAllowFallback(allowFallback);
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for Subsampling3DLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
}
}

View File

@ -480,17 +480,32 @@ public class SubsamplingLayer extends NoParamLayer {
}
/**
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
* (non-CuDNN) implementation for ConvolutionLayer will be used
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
*
* @deprecated Use {@link #helperAllowFallback(boolean)}
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
@Deprecated
public T cudnnAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
/**
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
* (non-MKL/CuDNN) implementation for SubsamplingLayer will be used
*
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
*/
public T helperAllowFallback(boolean allowFallback) {
this.cudnnAllowFallback = allowFallback;
return (T) this;
}
/**
* When doing average pooling, should the padding values be included in the divisor or not?<br>
* Not applicable for max and p-norm pooling.<br>

View File

@ -378,7 +378,7 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
}
} else {
throw new RuntimeException(e);
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
}
}
if (ret != null) {
@ -453,8 +453,30 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
//String afn = conf.getLayer().getActivationFunction();
IActivation afn = layerConf().getActivationFn();
if (helper != null && Shape.strideDescendingCAscendingF(z)) {
INDArray ret = helper.activate(z, layerConf().getActivationFn(), training);
if (helper != null && Shape.strideDescendingCAscendingF(z) && (helperCountFail == 0 || !layerConf().isCudnnAllowFallback())) {
INDArray ret = null;
try {
ret = helper.activate(z, layerConf().getActivationFn(), training);
} catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e) {
if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if (layerConf().isCudnnAllowFallback()) {
helperCountFail++;
if (helper instanceof MKLDNNConvHelper) {
log.warn("MKL-DNN execution failed - falling back on built-in implementation", e);
} else {
log.warn("CuDNN execution failed - falling back on built-in implementation", e);
}
} else {
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}

View File

@ -22,7 +22,7 @@ import java.lang.reflect.Method;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Base class for MLK-DNN Helpers
* Base class for MKL-DNN Helpers
* @author Alex Black
*/
public class BaseMKLDNNHelper {

View File

@ -41,6 +41,8 @@ public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.n
*/
protected Map<String, INDArray> tBpttStateMap = new ConcurrentHashMap<>();
protected int helperCountFail = 0;
public BaseRecurrentLayer(NeuralNetConfiguration conf, DataType dataType) {
super(conf, dataType);
}

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -90,7 +89,8 @@ public class GravesBidirectionalLSTM
final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr);
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this,
this.conf,
this.layerConf().getGateActivationFn(), this.input,
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
@ -98,13 +98,14 @@ public class GravesBidirectionalLSTM
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray, true,
null, workspaceMgr);
null, workspaceMgr, layerConf().isHelperAllowFallback());
final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false, workspaceMgr);
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this,
this.conf,
this.layerConf().getGateActivationFn(), this.input,
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
@ -112,7 +113,7 @@ public class GravesBidirectionalLSTM
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true,
null, workspaceMgr);
null, workspaceMgr, layerConf().isHelperAllowFallback());
//merge the gradient, which is key value pair of String,INDArray
@ -175,7 +176,7 @@ public class GravesBidirectionalLSTM
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null,
forBackprop || (cacheMode != CacheMode.NONE && training), true,
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, maskArray, true, null,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
this.input,
@ -184,7 +185,7 @@ public class GravesBidirectionalLSTM
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null,
forBackprop || (cacheMode != CacheMode.NONE && training), false,
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
cachedPassForward = forwardsEval;
cachedPassBackward = backwardsEval;
@ -230,7 +231,7 @@ public class GravesBidirectionalLSTM
return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input,
getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training,
prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true,
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
}
}

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -92,11 +91,12 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
}
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this,
this.conf, this.layerConf().getGateActivationFn(), this.input,
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY,
GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null,
workspaceMgr);
workspaceMgr, layerConf().isHelperAllowFallback());
weightNoiseParams.clear();
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@ -141,7 +141,7 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
prevMemCellState, forBackprop || (cacheMode != CacheMode.NONE && training), true,
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, true, null,
cacheMode, workspaceMgr);
cacheMode, workspaceMgr, layerConf().isHelperAllowFallback());
if (training && cacheMode != CacheMode.NONE) {

View File

@ -17,7 +17,6 @@
package org.deeplearning4j.nn.layers.recurrent;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.nn.api.Layer;
import org.deeplearning4j.nn.api.MaskState;
import org.deeplearning4j.nn.conf.CacheMode;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@ -32,8 +31,6 @@ import org.nd4j.linalg.primitives.Pair;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.nd4j.util.OneTimeLogger;
import java.util.Properties;
/**
* LSTM layer implementation.
*
@ -116,10 +113,12 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
}
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this,
this.conf, this.layerConf().getGateActivationFn(), this.input,
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY,
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr);
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr,
layerConf().isHelperAllowFallback());
weightNoiseParams.clear();
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@ -161,7 +160,7 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
prevMemCellState, (training && cacheMode != CacheMode.NONE) || forBackprop, true,
LSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, false, helper,
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
if (training && cacheMode != CacheMode.NONE) {
cachedFwdPass = fwd;

View File

@ -29,6 +29,7 @@ import org.deeplearning4j.nn.conf.memory.MemoryReport;
import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.layers.BaseLayer;
import org.deeplearning4j.nn.layers.mkldnn.MKLDNNConvHelper;
import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.nd4j.linalg.activations.IActivation;
@ -38,6 +39,7 @@ import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
import org.nd4j.linalg.api.ops.impl.transforms.same.TimesOneMinus;
import org.nd4j.linalg.api.shape.Shape;
import org.nd4j.linalg.exception.ND4JOpProfilerException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.primitives.Pair;
@ -81,7 +83,7 @@ public class LSTMHelpers {
* Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
* and rnnTimeStep whilst being reasonably efficient for all
*/
static public FwdPassReturn activateHelper(final BaseLayer layer, final NeuralNetConfiguration conf,
static public FwdPassReturn activateHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be found in range 0 to 1)
INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
@ -91,7 +93,7 @@ public class LSTMHelpers {
final String inputWeightKey, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
final LSTMHelper helper, final CacheMode cacheMode, // cacheMode for layer calling this helper
final LayerWorkspaceMgr workspaceMgr
final LayerWorkspaceMgr workspaceMgr, boolean isHelperAllowFallback
) {
//Mini-batch data format: for mini-batch size m, nIn inputs, and T time series length
@ -198,10 +200,28 @@ public class LSTMHelpers {
prevOutputActivations = Nd4j.zeros(input.dataType(), new long[] {miniBatchSize, hiddenLayerSize});
}
if (helper != null) {
FwdPassReturn ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
FwdPassReturn ret = null;
try {
ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback){
layer.helperCountFail++;
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during LSTM MKL/CuDNN helper forward pass - helperAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}
@ -424,7 +444,7 @@ public class LSTMHelpers {
}
}
static public Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
static public Pair<Gradient, INDArray> backpropGradientHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength,
@ -433,7 +453,8 @@ public class LSTMHelpers {
final Map<String, INDArray> gradientViews, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
final LSTMHelper helper,
final LayerWorkspaceMgr workspaceMgr) {
final LayerWorkspaceMgr workspaceMgr,
final boolean isHelperAllowFallback) {
input = input.castTo(inputWeights.dataType()); //No-op if
@ -496,11 +517,29 @@ public class LSTMHelpers {
rwGradientsGG = rwGradientsOut.get(all(), NDArrayIndex.point(4 * hiddenLayerSize + 2)).reshape(1, recurrentWeights.size(0));
}
if (helper != null) {
Pair<Gradient, INDArray> ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
hasPeepholeConnections, workspaceMgr);
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
Pair<Gradient, INDArray> ret = null;
try {
ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
hasPeepholeConnections, workspaceMgr);
}catch (ND4JOpProfilerException e){
throw e; //NaN panic etc for debugging
} catch (Exception e){
if(e.getMessage() != null && e.getMessage().contains("Failed to allocate")){
//This is a memory exception - don't fallback to built-in implementation
throw e;
}
if(isHelperAllowFallback){
layer.helperCountFail++;
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
} else {
throw new RuntimeException("Error during LSTM MKL/CuDNN helper backprop - helperAllowFallback() is set to false", e);
}
}
if (ret != null) {
return ret;
}

View File

@ -45,7 +45,7 @@ Ideally, these should be excluded from any timing/performance results you report
For example: what BLAS implementation (MKL, OpenBLAS, etc)? If you are using CUDA, are you using CuDNN?
ND4J and DL4J can use these libraries (MKL, CuDNN) when they are available - but are not always available by default. If they are not made available, performance can be lower - sometimes considerably.
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MLK) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MKL) your results may simply reflect the performance differences in the BLAS library being used - and not the performance of the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
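
For example, the following sketch prints which ND4J backend is active along with its environment details (assuming the standard Nd4j.getBackend() and OpExecutioner.printEnvironmentInformation() utilities; the class name is illustrative):

import org.nd4j.linalg.factory.Nd4j;

public class BackendInfo {
    public static void main(String[] args) {
        // Which ND4J backend is on the classpath: nd4j-native (CPU with MKL or OpenBLAS) vs nd4j-cuda
        System.out.println("ND4J backend: " + Nd4j.getBackend().getClass().getName());
        // Logs BLAS vendor, device and memory details for the active backend
        Nd4j.getExecutioner().printEnvironmentInformation();
    }
}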
3. How are things configured?