Dl4j LSTM and Dropout CuDNN fallback and options (#152)
* add fallback for Conv layer activation
* add fallback and config option for LSTM layers
* add fallback option and setting for dropout
* fix comments and error messages
* move helper fail count to layer instance
* ignore helperCountFail for equals and json
* typo fix (MLK -> MKL)
* add MKLDNN to error messages
* add helperAllowFallback to builders, deprecate cudnnAllowFallback

Signed-off-by: Ryan Nett <rnett@skymind.io>

Branch: master
parent 70af8c2afc
commit f40bdcf885
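In user code, the change surfaces as a helperAllowFallback(boolean) option on the Dropout configuration and on the layer builders, with the CuDNN-specific cudnnAllowFallback(boolean) kept as a deprecated alias. A minimal usage sketch follows; the class name and layer sizes are illustrative and assume the 1.0.0-beta-era builder API shown in the diff below, they are not part of this commit.

import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.LSTM;

public class HelperFallbackExample {
    public static void main(String[] args) {
        // LSTM layer with dropout: propagate helper (CuDNN/MKL-DNN) errors to the caller
        // instead of silently falling back to the built-in implementation.
        LSTM lstm = new LSTM.Builder()
                .nIn(16).nOut(32)                                      // placeholder sizes
                .dropOut(new Dropout(0.5).helperAllowFallback(false))  // dropout-level switch
                .helperAllowFallback(false)                            // layer-level switch
                .build();
        System.out.println(lstm);
    }
}

With the default helperAllowFallback(true), a failed helper call is logged, counted in helperCountFail, and execution continues on the built-in implementation.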
@ -186,7 +186,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
|
|||
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
|
||||
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
|
||||
false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true,
|
||||
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutput;
|
||||
null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutput;
|
||||
|
||||
final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(),
|
||||
lstm.input(),
|
||||
|
@ -194,7 +194,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
|
|||
lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
|
||||
lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
|
||||
true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, null,
|
||||
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutputAsArrays;
|
||||
CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutputAsArrays;
|
||||
|
||||
//I have no idea what the heck this does --Ben
|
||||
for (int i = 0; i < timeSeriesLength; i++) {
|
||||
|
|
|
@ -18,6 +18,8 @@ package org.deeplearning4j.nn.conf.dropout;
|
|||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.deeplearning4j.nn.workspace.ArrayType;
|
||||
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
|
||||
|
@ -26,11 +28,11 @@ import org.nd4j.linalg.api.buffer.DataType;
|
|||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
|
||||
import org.nd4j.linalg.api.ops.random.impl.DropOutInverted;
|
||||
import org.nd4j.linalg.exception.ND4JOpProfilerException;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.linalg.schedule.ISchedule;
|
||||
import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties;
|
||||
import org.nd4j.shade.jackson.annotation.JsonProperty;
|
||||
import org.nd4j.util.OneTimeLogger;
|
||||
|
||||
/**
|
||||
* Implements standard (inverted) dropout.<br>
|
||||
|
@ -64,17 +66,29 @@ import org.nd4j.util.OneTimeLogger;
|
|||
* @author Alex Black
|
||||
*/
|
||||
@Data
|
||||
@JsonIgnoreProperties({"mask", "helper"})
|
||||
@EqualsAndHashCode(exclude = {"mask", "helper"})
|
||||
@JsonIgnoreProperties({"mask", "helper", "helperCountFail"})
|
||||
@EqualsAndHashCode(exclude = {"mask", "helper", "helperCountFail"})
|
||||
@Slf4j
|
||||
public class Dropout implements IDropout {
|
||||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for Dropout will be used
|
||||
*
|
||||
*/
|
||||
@Getter
|
||||
@Setter
|
||||
protected boolean helperAllowFallback = true;
|
||||
|
||||
private double p;
|
||||
private ISchedule pSchedule;
|
||||
private transient INDArray mask;
|
||||
private transient DropoutHelper helper;
|
||||
private boolean initializedHelper = false;
|
||||
|
||||
private int helperCountFail = 0;
|
||||
|
||||
/**
|
||||
* @param activationRetainProbability Probability of retaining an activation - see {@link Dropout} javadoc
|
||||
*/
|
||||
|
@ -96,6 +110,18 @@ public class Dropout implements IDropout {
|
|||
this(Double.NaN, activationRetainProbabilitySchedule);
|
||||
}
|
||||
|
||||
/**
|
||||
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-helper) implementation for Dropout will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-helper implementation should be used
|
||||
*/
|
||||
public Dropout helperAllowFallback(boolean allowFallback) {
|
||||
this.setHelperAllowFallback(allowFallback);
|
||||
return this;
|
||||
}
|
||||
|
||||
protected Dropout(@JsonProperty("p") double activationRetainProbability, @JsonProperty("pSchedule") ISchedule activationRetainProbabilitySchedule) {
|
||||
this.p = activationRetainProbability;
|
||||
this.pSchedule = activationRetainProbabilitySchedule;
|
||||
|
@ -141,8 +167,28 @@ public class Dropout implements IDropout {
|
|||
initializeHelper(output.dataType());
|
||||
}
|
||||
|
||||
if(helper != null){
|
||||
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
|
||||
boolean helperWorked = false;
|
||||
try {
|
||||
helper.applyDropout(inputActivations, output, p);
|
||||
helperWorked = true;
|
||||
}catch (ND4JOpProfilerException e){
|
||||
throw e; //NaN panic etc for debugging
|
||||
} catch (Exception e){
|
||||
if(e.getMessage().contains("Failed to allocate")){
|
||||
//This is a memory exception - don't fallback to built-in implementation
|
||||
throw e;
|
||||
}
|
||||
|
||||
if(isHelperAllowFallback()){
|
||||
helperCountFail++;
|
||||
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
|
||||
} else {
|
||||
throw new RuntimeException("Error during Dropout CuDNN helper forward pass - helperAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
|
||||
if(helperWorked)
|
||||
return output;
|
||||
}
|
||||
|
||||
|
@ -159,8 +205,28 @@ public class Dropout implements IDropout {
|
|||
|
||||
@Override
|
||||
public INDArray backprop(INDArray gradAtOutput, INDArray gradAtInput, int iteration, int epoch) {
|
||||
if(helper != null){
|
||||
if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
|
||||
boolean helperWorked = false;
|
||||
try {
|
||||
helper.backprop(gradAtOutput, gradAtInput);
|
||||
helperWorked = true;
|
||||
}catch (ND4JOpProfilerException e){
|
||||
throw e; //NaN panic etc for debugging
|
||||
} catch (Exception e){
|
||||
if(e.getMessage().contains("Failed to allocate")){
|
||||
//This is a memory exception - don't fallback to built-in implementation
|
||||
throw e;
|
||||
}
|
||||
|
||||
if(isHelperAllowFallback()){
|
||||
helperCountFail++;
|
||||
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
|
||||
} else {
|
||||
throw new RuntimeException("Error during Dropout CuDNN helper backprop - helperAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
|
||||
if(helperWorked)
|
||||
return gradAtInput;
|
||||
}
|
||||
|
||||
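The guard-and-fallback logic added above to the Dropout forward pass (applyDropout) and to backprop follows one pattern; condensed into a self-contained sketch (simplified, with a hypothetical DropoutHelperStub standing in for the real helper type; not a verbatim copy of the class):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.exception.ND4JOpProfilerException;

class GuardedDropoutSketch {
    interface DropoutHelperStub { void applyDropout(INDArray input, INDArray output, double p); }

    private DropoutHelperStub helper;              // CuDNN/MKL-DNN helper, if one could be loaded
    private int helperCountFail = 0;               // excluded from equals() and JSON in the real class
    private boolean helperAllowFallback = true;

    INDArray applyDropout(INDArray input, INDArray output, double p) {
        if (helper != null && (helperCountFail == 0 || !helperAllowFallback)) {
            try {
                helper.applyDropout(input, output, p);
                return output;                                         // helper path succeeded
            } catch (ND4JOpProfilerException e) {
                throw e;                                               // NaN panic etc. - always propagate
            } catch (Exception e) {
                if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
                    throw e;                                           // memory exception - never fall back
                }
                if (helperAllowFallback) {
                    helperCountFail++;                                 // skip the helper on later calls
                } else {
                    throw new RuntimeException("Dropout helper failed and helperAllowFallback is false", e);
                }
            }
        }
        return builtInDropout(input, output, p);                       // built-in (non-helper) path
    }

    private INDArray builtInDropout(INDArray input, INDArray output, double p) {
        return output;                                                 // placeholder for the real implementation
    }
}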
|
|
|
@ -17,8 +17,6 @@
|
|||
package org.deeplearning4j.nn.conf.layers;
|
||||
|
||||
import lombok.*;
|
||||
import org.deeplearning4j.nn.params.LSTMParamInitializer;
|
||||
import org.deeplearning4j.nn.weights.WeightInit;
|
||||
import org.nd4j.linalg.activations.Activation;
|
||||
import org.nd4j.linalg.activations.IActivation;
|
||||
import org.nd4j.linalg.activations.impl.ActivationSigmoid;
|
||||
|
@ -35,11 +33,13 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
|
|||
|
||||
protected double forgetGateBiasInit;
|
||||
protected IActivation gateActivationFn = new ActivationSigmoid();
|
||||
protected boolean helperAllowFallback = true;
|
||||
|
||||
protected AbstractLSTM(Builder builder) {
|
||||
super(builder);
|
||||
this.forgetGateBiasInit = builder.forgetGateBiasInit;
|
||||
this.gateActivationFn = builder.gateActivationFn;
|
||||
this.helperAllowFallback = builder.helperAllowFallback;
|
||||
}
|
||||
|
||||
@AllArgsConstructor
|
||||
|
@ -60,6 +60,14 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
|
|||
*/
|
||||
protected IActivation gateActivationFn = new ActivationSigmoid();
|
||||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for LSTM/GravesLSTM will be used
|
||||
*
|
||||
*/
|
||||
protected boolean helperAllowFallback = true;
|
||||
|
||||
/**
|
||||
* Set forget gate bias initializations. Values in range 1-5 can potentially help with learning or longer-term
|
||||
* dependencies.
|
||||
|
@ -100,6 +108,18 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
|
|||
return (T) this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-helper) implementation for LSTM/GravesLSTM will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-helper implementation should be used
|
||||
*/
|
||||
public T helperAllowFallback(boolean allowFallback) {
|
||||
this.setHelperAllowFallback(allowFallback);
|
||||
return (T) this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -428,16 +428,31 @@ public class BatchNormalization extends FeedForwardLayer {
|
|||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for BatchNormalization will be used
|
||||
*
|
||||
* @deprecated Use {@link #helperAllowFallback(boolean)}
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
@Deprecated
|
||||
public Builder cudnnAllowFallback(boolean allowFallback) {
|
||||
this.setCudnnAllowFallback(allowFallback);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for BatchNormalizationLayer will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
public Builder helperAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* How should the moving average of variance be stored? Two different parameterizations are supported.
|
||||
* useLogStd(false): equivalent to 1.0.0-beta3 and earlier. The variance "parameter" is stored directly as
|
||||
|
|
|
@ -533,14 +533,29 @@ public class ConvolutionLayer extends FeedForwardLayer {
|
|||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for ConvolutionLayer will be used
|
||||
*
|
||||
* @deprecated Use {@link #helperAllowFallback(boolean)}
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
@Deprecated
|
||||
public T cudnnAllowFallback(boolean allowFallback) {
|
||||
this.setCudnnAllowFallback(allowFallback);
|
||||
return (T) this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
public T helperAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return (T) this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
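For builders that already exposed cudnnAllowFallback(boolean), such as ConvolutionLayer above, the old method is kept as a deprecated alias, so migration is a one-line rename. A sketch, with placeholder kernel and channel sizes that are not taken from this commit:

import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;

public class FallbackMigrationExample {
    public static void main(String[] args) {
        // Before this commit (still compiles, now flagged @Deprecated):
        ConvolutionLayer oldStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .cudnnAllowFallback(false)
                .build();

        // After: equivalent behaviour, with a name that also covers the MKL-DNN helper on CPU:
        ConvolutionLayer newStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(1).nOut(16)
                .helperAllowFallback(false)
                .build();

        System.out.println(oldStyle + "\n" + newStyle);
    }
}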
|
|
@ -53,11 +53,13 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
|
|||
|
||||
private double forgetGateBiasInit;
|
||||
private IActivation gateActivationFn = new ActivationSigmoid();
|
||||
protected boolean helperAllowFallback = true;
|
||||
|
||||
private GravesBidirectionalLSTM(Builder builder) {
|
||||
super(builder);
|
||||
this.forgetGateBiasInit = builder.forgetGateBiasInit;
|
||||
this.gateActivationFn = builder.gateActivationFn;
|
||||
this.helperAllowFallback = builder.helperAllowFallback;
|
||||
|
||||
initializeConstraints(builder);
|
||||
}
|
||||
|
@ -123,6 +125,14 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
|
|||
*/
|
||||
private IActivation gateActivationFn = new ActivationSigmoid();
|
||||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for GravesBidirectionalLSTM will be used
|
||||
*
|
||||
*/
|
||||
protected boolean helperAllowFallback = true;
|
||||
|
||||
/**
|
||||
* Set forget gate bias initializations. Values in range 1-5 can potentially help with learning or longer-term
|
||||
* dependencies.
|
||||
|
@ -163,6 +173,18 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-helper) implementation for GravesBidirectionalLSTM will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-helper implementation should be used
|
||||
*/
|
||||
public Builder helperAllowFallback(boolean allowFallback) {
|
||||
this.setHelperAllowFallback(allowFallback);
|
||||
return (Builder) this;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public GravesBidirectionalLSTM build() {
|
||||
return new GravesBidirectionalLSTM(this);
|
||||
|
|
|
@ -238,16 +238,31 @@ public class LocalResponseNormalization extends Layer {
|
|||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for LocalResponseNormalization will be used
|
||||
*
|
||||
* @deprecated Use {@link #helperAllowFallback(boolean)}
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
@Deprecated
|
||||
public Builder cudnnAllowFallback(boolean allowFallback) {
|
||||
this.setCudnnAllowFallback(allowFallback);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for LocalResponseNormalizationLayer will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
public Builder helperAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LocalResponseNormalization build() {
|
||||
return new LocalResponseNormalization(this);
|
||||
|
|
|
@ -455,15 +455,30 @@ public class Subsampling3DLayer extends NoParamLayer {
|
|||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
|
||||
* (non-CuDNN) implementation for Subsampling3DLayer will be used
|
||||
*
|
||||
* @deprecated Use {@link #helperAllowFallback(boolean)}
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
@Deprecated
|
||||
public T cudnnAllowFallback(boolean allowFallback) {
|
||||
this.setCudnnAllowFallback(allowFallback);
|
||||
return (T) this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for Subsampling3DLayer will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
public T helperAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return (T) this;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -480,17 +480,32 @@ public class SubsamplingLayer extends NoParamLayer {
|
|||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
|
||||
* If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
|
||||
* (non-CuDNN) implementation for ConvolutionLayer will be used
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for SubsamplingLayer will be used
|
||||
*
|
||||
* @deprecated Use {@link #helperAllowFallback(boolean)}
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
@Deprecated
|
||||
public T cudnnAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return (T) this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
|
||||
* If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
|
||||
* (non-MKL/CuDNN) implementation for SubsamplingLayer will be used
|
||||
*
|
||||
* @param allowFallback Whether fallback to non-CuDNN implementation should be used
|
||||
*/
|
||||
public T helperAllowFallback(boolean allowFallback) {
|
||||
this.cudnnAllowFallback = allowFallback;
|
||||
return (T) this;
|
||||
}
|
||||
|
||||
/**
|
||||
* When doing average pooling, should the padding values be included in the divisor or not?<br>
|
||||
* Not applicable for max and p-norm pooling.<br>
|
||||
|
|
|
@ -378,7 +378,7 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
|
|||
log.warn("CuDNN execution failed - falling back on built-in implementation",e);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException(e);
|
||||
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
if (ret != null) {
|
||||
|
@ -453,8 +453,30 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
|
|||
//String afn = conf.getLayer().getActivationFunction();
|
||||
IActivation afn = layerConf().getActivationFn();
|
||||
|
||||
if (helper != null && Shape.strideDescendingCAscendingF(z)) {
|
||||
INDArray ret = helper.activate(z, layerConf().getActivationFn(), training);
|
||||
if (helper != null && Shape.strideDescendingCAscendingF(z) && (helperCountFail == 0 || !layerConf().isCudnnAllowFallback())) {
|
||||
INDArray ret = null;
|
||||
try {
|
||||
ret = helper.activate(z, layerConf().getActivationFn(), training);
|
||||
} catch (ND4JOpProfilerException e){
|
||||
throw e; //NaN panic etc for debugging
|
||||
} catch (Exception e) {
|
||||
if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
|
||||
//This is a memory exception - don't fallback to built-in implementation
|
||||
throw e;
|
||||
}
|
||||
|
||||
if (layerConf().isCudnnAllowFallback()) {
|
||||
helperCountFail++;
|
||||
if (helper instanceof MKLDNNConvHelper) {
|
||||
log.warn("MKL-DNN execution failed - falling back on built-in implementation", e);
|
||||
} else {
|
||||
log.warn("CuDNN execution failed - falling back on built-in implementation", e);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != null) {
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ import java.lang.reflect.Method;
|
|||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/**
|
||||
* Base class for MLK-DNN Helpers
|
||||
* Base class for MKL-DNN Helpers
|
||||
* @author Alex Black
|
||||
*/
|
||||
public class BaseMKLDNNHelper {
|
||||
|
|
|
@ -41,6 +41,8 @@ public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.n
|
|||
*/
|
||||
protected Map<String, INDArray> tBpttStateMap = new ConcurrentHashMap<>();
|
||||
|
||||
protected int helperCountFail = 0;
|
||||
|
||||
public BaseRecurrentLayer(NeuralNetConfiguration conf, DataType dataType) {
|
||||
super(conf, dataType);
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.deeplearning4j.nn.layers.recurrent;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.deeplearning4j.nn.api.Layer;
|
||||
import org.deeplearning4j.nn.api.MaskState;
|
||||
import org.deeplearning4j.nn.conf.CacheMode;
|
||||
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
|
||||
|
@ -90,7 +89,8 @@ public class GravesBidirectionalLSTM
|
|||
|
||||
final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr);
|
||||
|
||||
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
|
||||
final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this,
|
||||
this.conf,
|
||||
this.layerConf().getGateActivationFn(), this.input,
|
||||
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
|
||||
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
|
||||
|
@ -98,13 +98,14 @@ public class GravesBidirectionalLSTM
|
|||
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
|
||||
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
|
||||
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray, true,
|
||||
null, workspaceMgr);
|
||||
null, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
|
||||
|
||||
final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false, workspaceMgr);
|
||||
|
||||
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
|
||||
final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this,
|
||||
this.conf,
|
||||
this.layerConf().getGateActivationFn(), this.input,
|
||||
getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
|
||||
getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
|
||||
|
@ -112,7 +113,7 @@ public class GravesBidirectionalLSTM
|
|||
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
|
||||
GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
|
||||
GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true,
|
||||
null, workspaceMgr);
|
||||
null, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
|
||||
//merge the gradient, which is key value pair of String,INDArray
|
||||
|
@ -175,7 +176,7 @@ public class GravesBidirectionalLSTM
|
|||
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null,
|
||||
forBackprop || (cacheMode != CacheMode.NONE && training), true,
|
||||
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, maskArray, true, null,
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
|
||||
this.input,
|
||||
|
@ -184,7 +185,7 @@ public class GravesBidirectionalLSTM
|
|||
getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null,
|
||||
forBackprop || (cacheMode != CacheMode.NONE && training), false,
|
||||
GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null,
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
cachedPassForward = forwardsEval;
|
||||
cachedPassBackward = backwardsEval;
|
||||
|
@ -230,7 +231,7 @@ public class GravesBidirectionalLSTM
|
|||
return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input,
|
||||
getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training,
|
||||
prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true,
|
||||
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
|
||||
null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.deeplearning4j.nn.layers.recurrent;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.deeplearning4j.nn.api.Layer;
|
||||
import org.deeplearning4j.nn.api.MaskState;
|
||||
import org.deeplearning4j.nn.conf.CacheMode;
|
||||
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
|
||||
|
@ -92,11 +91,12 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
|
|||
}
|
||||
|
||||
|
||||
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
|
||||
Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this,
|
||||
this.conf, this.layerConf().getGateActivationFn(), this.input,
|
||||
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
|
||||
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY,
|
||||
GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null,
|
||||
workspaceMgr);
|
||||
workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
weightNoiseParams.clear();
|
||||
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
|
||||
|
@ -141,7 +141,7 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
|
|||
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
|
||||
prevMemCellState, forBackprop || (cacheMode != CacheMode.NONE && training), true,
|
||||
GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, true, null,
|
||||
cacheMode, workspaceMgr);
|
||||
cacheMode, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
|
||||
if (training && cacheMode != CacheMode.NONE) {
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.deeplearning4j.nn.layers.recurrent;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.deeplearning4j.nn.api.Layer;
|
||||
import org.deeplearning4j.nn.api.MaskState;
|
||||
import org.deeplearning4j.nn.conf.CacheMode;
|
||||
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
|
||||
|
@ -32,8 +31,6 @@ import org.nd4j.linalg.primitives.Pair;
|
|||
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
|
||||
import org.nd4j.util.OneTimeLogger;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* LSTM layer implementation.
|
||||
*
|
||||
|
@ -116,10 +113,12 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
|
|||
}
|
||||
|
||||
|
||||
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
|
||||
Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this,
|
||||
this.conf, this.layerConf().getGateActivationFn(), this.input,
|
||||
recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
|
||||
LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY,
|
||||
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr);
|
||||
LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr,
|
||||
layerConf().isHelperAllowFallback());
|
||||
|
||||
weightNoiseParams.clear();
|
||||
p.setSecond(backpropDropOutIfPresent(p.getSecond()));
|
||||
|
@ -161,7 +160,7 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
|
|||
this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
|
||||
prevMemCellState, (training && cacheMode != CacheMode.NONE) || forBackprop, true,
|
||||
LSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, false, helper,
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
|
||||
forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
|
||||
|
||||
if (training && cacheMode != CacheMode.NONE) {
|
||||
cachedFwdPass = fwd;
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.deeplearning4j.nn.conf.memory.MemoryReport;
|
|||
import org.deeplearning4j.nn.gradient.DefaultGradient;
|
||||
import org.deeplearning4j.nn.gradient.Gradient;
|
||||
import org.deeplearning4j.nn.layers.BaseLayer;
|
||||
import org.deeplearning4j.nn.layers.mkldnn.MKLDNNConvHelper;
|
||||
import org.deeplearning4j.nn.workspace.ArrayType;
|
||||
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
|
||||
import org.nd4j.linalg.activations.IActivation;
|
||||
|
@ -38,6 +39,7 @@ import org.nd4j.linalg.api.ndarray.INDArray;
|
|||
import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
|
||||
import org.nd4j.linalg.api.ops.impl.transforms.same.TimesOneMinus;
|
||||
import org.nd4j.linalg.api.shape.Shape;
|
||||
import org.nd4j.linalg.exception.ND4JOpProfilerException;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.linalg.indexing.NDArrayIndex;
|
||||
import org.nd4j.linalg.primitives.Pair;
|
||||
|
@ -81,7 +83,7 @@ public class LSTMHelpers {
|
|||
* Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
|
||||
* and rnnTimeStep whilst being reasonably efficient for all
|
||||
*/
|
||||
static public FwdPassReturn activateHelper(final BaseLayer layer, final NeuralNetConfiguration conf,
|
||||
static public FwdPassReturn activateHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
|
||||
final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be found in range 0 to 1)
|
||||
INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
|
||||
final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
|
||||
|
@ -91,7 +93,7 @@ public class LSTMHelpers {
|
|||
final String inputWeightKey, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
|
||||
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
|
||||
final LSTMHelper helper, final CacheMode cacheMode, // cacheMode for layer calling this helper
|
||||
final LayerWorkspaceMgr workspaceMgr
|
||||
final LayerWorkspaceMgr workspaceMgr, boolean isHelperAllowFallback
|
||||
) {
|
||||
|
||||
//Mini-batch data format: for mini-batch size m, nIn inputs, and T time series length
|
||||
|
@ -198,10 +200,28 @@ public class LSTMHelpers {
|
|||
prevOutputActivations = Nd4j.zeros(input.dataType(), new long[] {miniBatchSize, hiddenLayerSize});
|
||||
}
|
||||
|
||||
if (helper != null) {
|
||||
FwdPassReturn ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
|
||||
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
|
||||
FwdPassReturn ret = null;
|
||||
try {
|
||||
ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
|
||||
biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
|
||||
inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
|
||||
}catch (ND4JOpProfilerException e){
|
||||
throw e; //NaN panic etc for debugging
|
||||
} catch (Exception e){
|
||||
if(e.getMessage().contains("Failed to allocate")){
|
||||
//This is a memory exception - don't fallback to built-in implementation
|
||||
throw e;
|
||||
}
|
||||
|
||||
if(isHelperAllowFallback){
|
||||
layer.helperCountFail++;
|
||||
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
|
||||
} else {
|
||||
throw new RuntimeException("Error during LSTM MKL/CuDNN helper forward pass - helperAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != null) {
|
||||
return ret;
|
||||
}
|
||||
|
@ -424,7 +444,7 @@ public class LSTMHelpers {
|
|||
}
|
||||
}
|
||||
|
||||
static public Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
|
||||
static public Pair<Gradient, INDArray> backpropGradientHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
|
||||
final IActivation gateActivationFn, INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
|
||||
final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
|
||||
final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength,
|
||||
|
@ -433,7 +453,8 @@ public class LSTMHelpers {
|
|||
final Map<String, INDArray> gradientViews, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
|
||||
final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
|
||||
final LSTMHelper helper,
|
||||
final LayerWorkspaceMgr workspaceMgr) {
|
||||
final LayerWorkspaceMgr workspaceMgr,
|
||||
final boolean isHelperAllowFallback) {
|
||||
|
||||
input = input.castTo(inputWeights.dataType()); //No-op if
|
||||
|
||||
|
@ -496,11 +517,29 @@ public class LSTMHelpers {
|
|||
rwGradientsGG = rwGradientsOut.get(all(), NDArrayIndex.point(4 * hiddenLayerSize + 2)).reshape(1, recurrentWeights.size(0));
|
||||
}
|
||||
|
||||
if (helper != null) {
|
||||
Pair<Gradient, INDArray> ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
|
||||
if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
|
||||
Pair<Gradient, INDArray> ret = null;
|
||||
try {
|
||||
ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
|
||||
inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
|
||||
inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
|
||||
hasPeepholeConnections, workspaceMgr);
|
||||
}catch (ND4JOpProfilerException e){
|
||||
throw e; //NaN panic etc for debugging
|
||||
} catch (Exception e){
|
||||
if(e.getMessage().contains("Failed to allocate")){
|
||||
//This is a memory exception - don't fallback to built-in implementation
|
||||
throw e;
|
||||
}
|
||||
|
||||
if(isHelperAllowFallback){
|
||||
layer.helperCountFail++;
|
||||
log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
|
||||
} else {
|
||||
throw new RuntimeException("Error during LSTM MKL/CuDNN helper backprop - helperAllowFallback() is set to false", e);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != null) {
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ Ideally, these should be excluded from any timing/performance results you report
|
|||
For example: what BLAS implementation (MKL, OpenBLAS, etc)? If you are using CUDA, are you using CuDNN?
|
||||
ND4J and DL4J can use these libraries (MKL, CuDNN) when they are available - but they are not always available by default. If they are not made available, performance can be lower - sometimes considerably.
|
||||
|
||||
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MLK) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
|
||||
This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MKL), your results may simply reflect the performance differences in the BLAS library being used - and not the performance of the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
|
||||
|
||||
|
||||
3. How are things configured?
|
||||
|
|