Dl4j LSTM and Dropout CuDNN fallback and options (#152)
* add fallback for Conv layer activation
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* add fallback and config option for LSTM layers
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* add fallback option and setting for dropout
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* fix comments and error messages
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* move helper fail count to layer instance
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* ignore helperCountFail for equals and json
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* typo fix (MLK -> MKL)
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* add MKLDNN to error messages
  Signed-off-by: Ryan Nett <rnett@skymind.io>
* add helperAllowFallback to builders, deprecate cudnnAllowFallback
  Signed-off-by: Ryan Nett <rnett@skymind.io>
parent 70af8c2afc
commit f40bdcf885
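For orientation before the diff: this commit exposes the helper fallback behaviour as a per-layer and per-dropout option named helperAllowFallback, while keeping the older cudnnAllowFallback builder methods as deprecated aliases on the CNN-related builders. A minimal configuration sketch is below; the layer sizes and the surrounding setup are illustrative only, not taken from the commit.

```java
import org.deeplearning4j.nn.conf.dropout.Dropout;
import org.deeplearning4j.nn.conf.layers.LSTM;

public class HelperFallbackConfigExample {
    public static void main(String[] args) {
        // An LSTM layer that propagates CuDNN/MKL-DNN helper errors instead of
        // silently falling back to the built-in implementation.
        LSTM lstm = new LSTM.Builder()
                .nIn(128)                                // illustrative sizes
                .nOut(256)
                .helperAllowFallback(false)              // new builder option added in this commit
                .dropOut(new Dropout(0.5)
                        .helperAllowFallback(false))     // same switch on the Dropout config
                .build();

        System.out.println(lstm);
    }
}
```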
@@ -186,7 +186,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
                         lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
                         lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
                         false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true,
-                        null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutput;
+                        null, CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutput;
 
         final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(),
                         lstm.input(),
@@ -194,7 +194,7 @@ public class GravesBidirectionalLSTMTest extends BaseDL4JTest {
                         lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS),
                         lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null,
                         true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null, true, null,
-                        CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces()).fwdPassOutputAsArrays;
+                        CacheMode.NONE, LayerWorkspaceMgr.noWorkspaces(), true).fwdPassOutputAsArrays;
 
         //I have no idea what the heck this does --Ben
         for (int i = 0; i < timeSeriesLength; i++) {
@@ -18,6 +18,8 @@ package org.deeplearning4j.nn.conf.dropout;
 
 import lombok.Data;
 import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.Setter;
 import lombok.extern.slf4j.Slf4j;
 import org.deeplearning4j.nn.workspace.ArrayType;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
@@ -26,11 +28,11 @@ import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
 import org.nd4j.linalg.api.ops.random.impl.DropOutInverted;
+import org.nd4j.linalg.exception.ND4JOpProfilerException;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.schedule.ISchedule;
 import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties;
 import org.nd4j.shade.jackson.annotation.JsonProperty;
-import org.nd4j.util.OneTimeLogger;
 
 /**
  * Implements standard (inverted) dropout.<br>
@@ -64,17 +66,29 @@ import org.nd4j.util.OneTimeLogger;
  * @author Alex Black
  */
 @Data
-@JsonIgnoreProperties({"mask", "helper"})
-@EqualsAndHashCode(exclude = {"mask", "helper"})
+@JsonIgnoreProperties({"mask", "helper", "helperCountFail"})
+@EqualsAndHashCode(exclude = {"mask", "helper", "helperCountFail"})
 @Slf4j
 public class Dropout implements IDropout {
 
+    /**
+     * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
+     * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+     * (non-CuDNN) implementation for LSTM/GravesLSTM will be used
+     *
+     */
+    @Getter
+    @Setter
+    protected boolean helperAllowFallback = true;
+
     private double p;
     private ISchedule pSchedule;
     private transient INDArray mask;
     private transient DropoutHelper helper;
     private boolean initializedHelper = false;
 
+    private int helperCountFail = 0;
+
     /**
      * @param activationRetainProbability Probability of retaining an activation - see {@link Dropout} javadoc
      */
@@ -96,6 +110,18 @@ public class Dropout implements IDropout {
         this(Double.NaN, activationRetainProbabilitySchedule);
     }
 
+    /**
+     * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
+     * If set to false, an exception in the helper will be propagated back to the user. If false, the built-in
+     * (non-helper) implementation for Dropout will be used
+     *
+     * @param allowFallback Whether fallback to non-helper implementation should be used
+     */
+    public Dropout helperAllowFallback(boolean allowFallback) {
+        this.setHelperAllowFallback(allowFallback);
+        return this;
+    }
+
     protected Dropout(@JsonProperty("p") double activationRetainProbability, @JsonProperty("pSchedule") ISchedule activationRetainProbabilitySchedule) {
         this.p = activationRetainProbability;
         this.pSchedule = activationRetainProbabilitySchedule;
@@ -141,8 +167,28 @@ public class Dropout implements IDropout {
             initializeHelper(output.dataType());
         }
 
-        if(helper != null){
+        if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
+            boolean helperWorked = false;
+            try {
                 helper.applyDropout(inputActivations, output, p);
+                helperWorked = true;
+            }catch (ND4JOpProfilerException e){
+                throw e; //NaN panic etc for debugging
+            } catch (Exception e){
+                if(e.getMessage().contains("Failed to allocate")){
+                    //This is a memory exception - don't fallback to built-in implementation
+                    throw e;
+                }
+
+                if(isHelperAllowFallback()){
+                    helperCountFail++;
+                    log.warn("CuDNN execution failed - falling back on built-in implementation",e);
+                } else {
+                    throw new RuntimeException("Error during Dropout CuDNN helper forward pass - helperAllowFallback() is set to false", e);
+                }
+            }
+
+            if(helperWorked)
                 return output;
         }
 
@@ -159,8 +205,28 @@ public class Dropout implements IDropout {
 
     @Override
     public INDArray backprop(INDArray gradAtOutput, INDArray gradAtInput, int iteration, int epoch) {
-        if(helper != null){
+        if(helper != null && (helperCountFail == 0 || !isHelperAllowFallback())){
+            boolean helperWorked = false;
+            try {
                 helper.backprop(gradAtOutput, gradAtInput);
+                helperWorked = true;
+            }catch (ND4JOpProfilerException e){
+                throw e; //NaN panic etc for debugging
+            } catch (Exception e){
+                if(e.getMessage().contains("Failed to allocate")){
+                    //This is a memory exception - don't fallback to built-in implementation
+                    throw e;
+                }
+
+                if(isHelperAllowFallback()){
+                    helperCountFail++;
+                    log.warn("CuDNN execution failed - falling back on built-in implementation",e);
+                } else {
+                    throw new RuntimeException("Error during Dropout CuDNN helper backprop - helperAllowFallback() is set to false", e);
+                }
+            }
+
+            if(helperWorked)
                 return gradAtInput;
         }
 
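The applyDropout and backprop hunks above follow the same control flow that the rest of this commit applies to the LSTM and convolution helpers: attempt the helper at most until its first failure, never mask profiler or allocation errors, and only fall back to the built-in implementation when the user allows it. A self-contained sketch of that pattern follows; the class and method names here are illustrative, not part of the DL4J API.

```java
import java.util.function.Supplier;

// Illustrative sketch of the fallback pattern used throughout this commit.
public class HelperFallbackSketch {

    private int helperCountFail = 0;          // mirrors the per-instance counter added above
    private final boolean helperAllowFallback;

    public HelperFallbackSketch(boolean helperAllowFallback) {
        this.helperAllowFallback = helperAllowFallback;
    }

    public <T> T run(Supplier<T> helperOp, Supplier<T> builtInOp) {
        // Skip the helper once it has failed, unless fallback is disallowed
        // (in which case the helper keeps running so its error reaches the user).
        if (helperOp != null && (helperCountFail == 0 || !helperAllowFallback)) {
            try {
                return helperOp.get();
            } catch (RuntimeException e) {
                if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
                    throw e;    // memory problems are never masked by falling back
                }
                if (helperAllowFallback) {
                    helperCountFail++;
                    // the real code logs a warning here, then falls through to the built-in path
                } else {
                    throw new RuntimeException("Helper failed and fallback is disabled", e);
                }
            }
        }
        return builtInOp.get();
    }
}
```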
@@ -17,8 +17,6 @@
 package org.deeplearning4j.nn.conf.layers;
 
 import lombok.*;
-import org.deeplearning4j.nn.params.LSTMParamInitializer;
-import org.deeplearning4j.nn.weights.WeightInit;
 import org.nd4j.linalg.activations.Activation;
 import org.nd4j.linalg.activations.IActivation;
 import org.nd4j.linalg.activations.impl.ActivationSigmoid;
@@ -35,11 +33,13 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
 
     protected double forgetGateBiasInit;
     protected IActivation gateActivationFn = new ActivationSigmoid();
+    protected boolean helperAllowFallback = true;
 
     protected AbstractLSTM(Builder builder) {
         super(builder);
         this.forgetGateBiasInit = builder.forgetGateBiasInit;
         this.gateActivationFn = builder.gateActivationFn;
+        this.helperAllowFallback = builder.helperAllowFallback;
     }
 
     @AllArgsConstructor
@@ -60,6 +60,14 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
          */
         protected IActivation gateActivationFn = new ActivationSigmoid();
 
+        /**
+         * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
+         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * (non-CuDNN) implementation for LSTM/GravesLSTM will be used
+         *
+         */
+        protected boolean helperAllowFallback = true;
+
         /**
          * Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
          * dependencies.
@@ -100,6 +108,18 @@ public abstract class AbstractLSTM extends BaseRecurrentLayer {
             return (T) this;
         }
 
+        /**
+         * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If false, the built-in
+         * (non-helper) implementation for LSTM/GravesLSTM will be used
+         *
+         * @param allowFallback Whether fallback to non-helper implementation should be used
+         */
+        public T helperAllowFallback(boolean allowFallback) {
+            this.setHelperAllowFallback(allowFallback);
+            return (T) this;
+        }
+
     }
 
 }
@@ -428,16 +428,31 @@ public class BatchNormalization extends FeedForwardLayer {
 
         /**
          * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
-         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
          * (non-CuDNN) implementation for BatchNormalization will be used
          *
+         * @deprecated Use {@link #helperAllowFallback(boolean)}
+         *
         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
         */
+        @Deprecated
        public Builder cudnnAllowFallback(boolean allowFallback) {
            this.setCudnnAllowFallback(allowFallback);
            return this;
        }
 
+        /**
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for BatchNormalizationLayer will be used
+         *
+         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
+         */
+        public Builder helperAllowFallback(boolean allowFallback) {
+            this.cudnnAllowFallback = allowFallback;
+            return this;
+        }
+
        /**
         * How should the moving average of variance be stored? Two different parameterizations are supported.
         * useLogStd(false): equivalent to 1.0.0-beta3 and earlier. The variance "parameter" is stored directly as
@@ -533,14 +533,29 @@ public class ConvolutionLayer extends FeedForwardLayer {
 
         /**
          * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
-         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
          * (non-CuDNN) implementation for ConvolutionLayer will be used
          *
+         * @deprecated Use {@link #helperAllowFallback(boolean)}
+         *
         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
         */
+        @Deprecated
        public T cudnnAllowFallback(boolean allowFallback) {
            this.setCudnnAllowFallback(allowFallback);
            return (T) this;
        }
 
+        /**
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
+         *
+         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
+         */
+        public T helperAllowFallback(boolean allowFallback) {
+            this.cudnnAllowFallback = allowFallback;
+            return (T) this;
+        }
     }
 }
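The cudnnAllowFallback deprecation above repeats on the other CNN-related builders below (LocalResponseNormalization, Subsampling3DLayer, SubsamplingLayer, and BatchNormalization earlier). A hedged migration sketch; the kernel size and channel counts are illustrative, not taken from the commit.

```java
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;

public class ConvFallbackMigration {
    public static void main(String[] args) {
        // Before this commit (still compiles, now marked @Deprecated):
        ConvolutionLayer oldStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(3).nOut(16)
                .cudnnAllowFallback(false)
                .build();

        // Preferred spelling after this commit, covering both CuDNN and MKL-DNN helpers:
        ConvolutionLayer newStyle = new ConvolutionLayer.Builder(3, 3)
                .nIn(3).nOut(16)
                .helperAllowFallback(false)
                .build();

        System.out.println(oldStyle + "\n" + newStyle);
    }
}
```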
@@ -53,11 +53,13 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
 
     private double forgetGateBiasInit;
     private IActivation gateActivationFn = new ActivationSigmoid();
+    protected boolean helperAllowFallback = true;
 
     private GravesBidirectionalLSTM(Builder builder) {
         super(builder);
         this.forgetGateBiasInit = builder.forgetGateBiasInit;
         this.gateActivationFn = builder.gateActivationFn;
+        this.helperAllowFallback = builder.helperAllowFallback;
 
         initializeConstraints(builder);
     }
@@ -123,6 +125,14 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
          */
         private IActivation gateActivationFn = new ActivationSigmoid();
 
+        /**
+         * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
+         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * (non-CuDNN) implementation for GravesBidirectionalLSTM will be used
+         *
+         */
+        protected boolean helperAllowFallback = true;
+
         /**
          * Set forget gate bias initalizations. Values in range 1-5 can potentially help with learning or longer-term
          * dependencies.
@@ -163,6 +173,18 @@ public class GravesBidirectionalLSTM extends BaseRecurrentLayer {
             return this;
         }
 
+        /**
+         * When using a helper (CuDNN or MKLDNN in some cases) and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If false, the built-in
+         * (non-helper) implementation for GravesBidirectionalLSTM will be used
+         *
+         * @param allowFallback Whether fallback to non-helper implementation should be used
+         */
+        public Builder helperAllowFallback(boolean allowFallback) {
+            this.setHelperAllowFallback(allowFallback);
+            return (Builder) this;
+        }
+
         @SuppressWarnings("unchecked")
         public GravesBidirectionalLSTM build() {
             return new GravesBidirectionalLSTM(this);
@@ -238,16 +238,31 @@ public class LocalResponseNormalization extends Layer {
 
         /**
          * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
-         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
          * (non-CuDNN) implementation for BatchNormalization will be used
          *
+         * @deprecated Use {@link #helperAllowFallback(boolean)}
+         *
         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
         */
+        @Deprecated
        public Builder cudnnAllowFallback(boolean allowFallback) {
            this.setCudnnAllowFallback(allowFallback);
            return this;
        }
 
+        /**
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for LocalResponseNormalizationLayer will be used
+         *
+         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
+         */
+        public Builder helperAllowFallback(boolean allowFallback) {
+            this.cudnnAllowFallback = allowFallback;
+            return this;
+        }
+
        @Override
        public LocalResponseNormalization build() {
            return new LocalResponseNormalization(this);
@@ -455,15 +455,30 @@ public class Subsampling3DLayer extends NoParamLayer {
 
         /**
          * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
-         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
+         * If set to false, an exception in CuDNN will be propagated back to the user. If true, the built-in
          * (non-CuDNN) implementation for ConvolutionLayer will be used
          *
+         * @deprecated Use {@link #helperAllowFallback(boolean)}
+         *
         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
         */
+        @Deprecated
        public T cudnnAllowFallback(boolean allowFallback) {
            this.setCudnnAllowFallback(allowFallback);
            return (T) this;
        }
 
+        /**
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for Subsampling3DLayer will be used
+         *
+         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
+         */
+        public T helperAllowFallback(boolean allowFallback) {
+            this.cudnnAllowFallback = allowFallback;
+            return (T) this;
+        }
     }
 
 }
@@ -480,17 +480,32 @@ public class SubsamplingLayer extends NoParamLayer {
         }
 
         /**
-         * When using CuDNN and an error is encountered, should fallback to the non-CuDNN implementatation be allowed?
-         * If set to false, an exception in CuDNN will be propagated back to the user. If false, the built-in
-         * (non-CuDNN) implementation for ConvolutionLayer will be used
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for ConvolutionLayer will be used
+         *
+         * @deprecated Use {@link #helperAllowFallback(boolean)}
          *
         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
         */
+        @Deprecated
        public T cudnnAllowFallback(boolean allowFallback) {
            this.cudnnAllowFallback = allowFallback;
            return (T) this;
        }
 
+        /**
+         * When using CuDNN or MKLDNN and an error is encountered, should fallback to the non-helper implementation be allowed?
+         * If set to false, an exception in the helper will be propagated back to the user. If true, the built-in
+         * (non-MKL/CuDNN) implementation for SubsamplingLayer will be used
+         *
+         * @param allowFallback Whether fallback to non-CuDNN implementation should be used
+         */
+        public T helperAllowFallback(boolean allowFallback) {
+            this.cudnnAllowFallback = allowFallback;
+            return (T) this;
+        }
+
        /**
         * When doing average pooling, should the padding values be included in the divisor or not?<br>
         * Not applicable for max and p-norm pooling.<br>
@@ -378,7 +378,7 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
                     log.warn("CuDNN execution failed - falling back on built-in implementation",e);
                 }
             } else {
-                throw new RuntimeException(e);
+                throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
             }
         }
         if (ret != null) {
@@ -453,8 +453,30 @@ public class ConvolutionLayer extends BaseLayer<org.deeplearning4j.nn.conf.layer
         //String afn = conf.getLayer().getActivationFunction();
         IActivation afn = layerConf().getActivationFn();
 
-        if (helper != null && Shape.strideDescendingCAscendingF(z)) {
-            INDArray ret = helper.activate(z, layerConf().getActivationFn(), training);
+        if (helper != null && Shape.strideDescendingCAscendingF(z) && (helperCountFail == 0 || !layerConf().isCudnnAllowFallback())) {
+            INDArray ret = null;
+            try {
+                ret = helper.activate(z, layerConf().getActivationFn(), training);
+            } catch (ND4JOpProfilerException e){
+                throw e; //NaN panic etc for debugging
+            } catch (Exception e) {
+                if (e.getMessage() != null && e.getMessage().contains("Failed to allocate")) {
+                    //This is a memory exception - don't fallback to built-in implementation
+                    throw e;
+                }
+
+                if (layerConf().isCudnnAllowFallback()) {
+                    helperCountFail++;
+                    if (helper instanceof MKLDNNConvHelper) {
+                        log.warn("MKL-DNN execution failed - falling back on built-in implementation", e);
+                    } else {
+                        log.warn("CuDNN execution failed - falling back on built-in implementation", e);
+                    }
+                } else {
+                    throw new RuntimeException("Error during ConvolutionLayer MKL/CuDNN helper forward pass - isCudnnAllowFallback() is set to false", e);
+                }
+            }
+
             if (ret != null) {
                 return ret;
             }
@@ -22,7 +22,7 @@ import java.lang.reflect.Method;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 /**
- * Base class for MLK-DNN Helpers
+ * Base class for MKL-DNN Helpers
  * @author Alex Black
  */
 public class BaseMKLDNNHelper {
@@ -41,6 +41,8 @@ public abstract class BaseRecurrentLayer<LayerConfT extends org.deeplearning4j.n
      */
     protected Map<String, INDArray> tBpttStateMap = new ConcurrentHashMap<>();
 
+    protected int helperCountFail = 0;
+
     public BaseRecurrentLayer(NeuralNetConfiguration conf, DataType dataType) {
         super(conf, dataType);
     }
@@ -17,7 +17,6 @@
 package org.deeplearning4j.nn.layers.recurrent;
 
 import lombok.extern.slf4j.Slf4j;
-import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@@ -90,7 +89,8 @@ public class GravesBidirectionalLSTM
 
         final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true, workspaceMgr);
 
-        final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
+        final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this,
+                        this.conf,
                         this.layerConf().getGateActivationFn(), this.input,
                         getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
                         getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
@@ -98,13 +98,14 @@ public class GravesBidirectionalLSTM
                         GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
                         GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
                         GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray, true,
-                        null, workspaceMgr);
+                        null, workspaceMgr, layerConf().isHelperAllowFallback());
 
 
 
         final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false, workspaceMgr);
 
-        final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
+        final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this,
+                        this.conf,
                         this.layerConf().getGateActivationFn(), this.input,
                         getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
                         getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
@@ -112,7 +113,7 @@ public class GravesBidirectionalLSTM
                         GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
                         GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
                         GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray, true,
-                        null, workspaceMgr);
+                        null, workspaceMgr, layerConf().isHelperAllowFallback());
 
 
         //merge the gradient, which is key value pair of String,INDArray
@@ -175,7 +176,7 @@ public class GravesBidirectionalLSTM
                             getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), training, null, null,
                             forBackprop || (cacheMode != CacheMode.NONE && training), true,
                             GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, maskArray, true, null,
-                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
+                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
 
             backwardsEval = LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(),
                             this.input,
@@ -184,7 +185,7 @@ public class GravesBidirectionalLSTM
                             getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS), training, null, null,
                             forBackprop || (cacheMode != CacheMode.NONE && training), false,
                             GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS, maskArray, true, null,
-                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
+                            forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
 
             cachedPassForward = forwardsEval;
             cachedPassBackward = backwardsEval;
@@ -230,7 +231,7 @@ public class GravesBidirectionalLSTM
             return LSTMHelpers.activateHelper(this, this.conf, this.layerConf().getGateActivationFn(), this.input,
                             getParam(recurrentKey), getParam(inputKey), getParam(biasKey), training,
                             prevOutputActivations, prevMemCellState, forBackprop, forwards, inputKey, maskArray, true,
-                            null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
+                            null, forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
         }
     }
 
@@ -17,7 +17,6 @@
 package org.deeplearning4j.nn.layers.recurrent;
 
 import lombok.extern.slf4j.Slf4j;
-import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@@ -92,11 +91,12 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
         }
 
 
-        Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
+        Pair<Gradient, INDArray> p = LSTMHelpers.backpropGradientHelper(this,
+                        this.conf, this.layerConf().getGateActivationFn(), this.input,
                         recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                         GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, GravesLSTMParamInitializer.RECURRENT_WEIGHT_KEY,
                         GravesLSTMParamInitializer.BIAS_KEY, gradientViews, maskArray, true, null,
-                        workspaceMgr);
+                        workspaceMgr, layerConf().isHelperAllowFallback());
 
         weightNoiseParams.clear();
         p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@@ -141,7 +141,7 @@ public class GravesLSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.la
                         this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
                         prevMemCellState, forBackprop || (cacheMode != CacheMode.NONE && training), true,
                         GravesLSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, true, null,
-                        cacheMode, workspaceMgr);
+                        cacheMode, workspaceMgr, layerConf().isHelperAllowFallback());
 
 
         if (training && cacheMode != CacheMode.NONE) {
@@ -17,7 +17,6 @@
 package org.deeplearning4j.nn.layers.recurrent;
 
 import lombok.extern.slf4j.Slf4j;
-import org.deeplearning4j.nn.api.Layer;
 import org.deeplearning4j.nn.api.MaskState;
 import org.deeplearning4j.nn.conf.CacheMode;
 import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
@@ -32,8 +31,6 @@ import org.nd4j.linalg.primitives.Pair;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.nd4j.util.OneTimeLogger;
 
-import java.util.Properties;
-
 /**
  * LSTM layer implementation.
  *
@@ -116,10 +113,12 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
         }
 
 
-        Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this.conf, this.layerConf().getGateActivationFn(), this.input,
+        Pair<Gradient,INDArray> p = LSTMHelpers.backpropGradientHelper(this,
+                        this.conf, this.layerConf().getGateActivationFn(), this.input,
                         recurrentWeights, inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                         LSTMParamInitializer.INPUT_WEIGHT_KEY, LSTMParamInitializer.RECURRENT_WEIGHT_KEY,
-                        LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr);
+                        LSTMParamInitializer.BIAS_KEY, gradientViews, null, false, helper, workspaceMgr,
+                        layerConf().isHelperAllowFallback());
 
         weightNoiseParams.clear();
         p.setSecond(backpropDropOutIfPresent(p.getSecond()));
@@ -161,7 +160,7 @@ public class LSTM extends BaseRecurrentLayer<org.deeplearning4j.nn.conf.layers.L
                         this.input, recurrentWeights, inputWeights, biases, training, prevOutputActivations,
                         prevMemCellState, (training && cacheMode != CacheMode.NONE) || forBackprop, true,
                         LSTMParamInitializer.INPUT_WEIGHT_KEY, maskArray, false, helper,
-                        forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr);
+                        forBackprop ? cacheMode : CacheMode.NONE, workspaceMgr, layerConf().isHelperAllowFallback());
 
         if (training && cacheMode != CacheMode.NONE) {
             cachedFwdPass = fwd;
@@ -29,6 +29,7 @@ import org.deeplearning4j.nn.conf.memory.MemoryReport;
 import org.deeplearning4j.nn.gradient.DefaultGradient;
 import org.deeplearning4j.nn.gradient.Gradient;
 import org.deeplearning4j.nn.layers.BaseLayer;
+import org.deeplearning4j.nn.layers.mkldnn.MKLDNNConvHelper;
 import org.deeplearning4j.nn.workspace.ArrayType;
 import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
 import org.nd4j.linalg.activations.IActivation;
@@ -38,6 +39,7 @@ import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.MulOp;
 import org.nd4j.linalg.api.ops.impl.transforms.same.TimesOneMinus;
 import org.nd4j.linalg.api.shape.Shape;
+import org.nd4j.linalg.exception.ND4JOpProfilerException;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.indexing.NDArrayIndex;
 import org.nd4j.linalg.primitives.Pair;
@@ -81,7 +83,7 @@ public class LSTMHelpers {
      * Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
      * and rnnTimeStep whilst being reasonably efficient for all
      */
-    static public FwdPassReturn activateHelper(final BaseLayer layer, final NeuralNetConfiguration conf,
+    static public FwdPassReturn activateHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
                     final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be found in range 0 to 1)
                     INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
                     final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
@@ -91,7 +93,7 @@ public class LSTMHelpers {
                     final String inputWeightKey, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
                     final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
                     final LSTMHelper helper, final CacheMode cacheMode, // cacheMode for layer calling this helper
-                    final LayerWorkspaceMgr workspaceMgr
+                    final LayerWorkspaceMgr workspaceMgr, boolean isHelperAllowFallback
                     ) {
 
         //Mini-batch data format: for mini-batch size m, nIn inputs, and T time series length
@@ -198,10 +200,28 @@ public class LSTMHelpers {
             prevOutputActivations = Nd4j.zeros(input.dataType(), new long[] {miniBatchSize, hiddenLayerSize});
         }
 
-        if (helper != null) {
-            FwdPassReturn ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
+        if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
+            FwdPassReturn ret = null;
+            try {
+                ret = helper.activate(layer, conf, gateActivationFn, input, recurrentWeights, inputWeights,
                             biases, training, prevOutputActivations, prevMemCellState, forBackprop, forwards,
                             inputWeightKey, maskArray, hasPeepholeConnections, workspaceMgr);
+            }catch (ND4JOpProfilerException e){
+                throw e; //NaN panic etc for debugging
+            } catch (Exception e){
+                if(e.getMessage().contains("Failed to allocate")){
+                    //This is a memory exception - don't fallback to built-in implementation
                    throw e;
+                }
+
+                if(isHelperAllowFallback){
+                    layer.helperCountFail++;
+                    log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
+                } else {
+                    throw new RuntimeException("Error during LSTM MKL/CuDNN helper forward pass - helperAllowFallback() is set to false", e);
+                }
+            }
+
             if (ret != null) {
                 return ret;
             }
@@ -424,7 +444,7 @@ public class LSTMHelpers {
         }
     }
 
-    static public Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
+    static public Pair<Gradient, INDArray> backpropGradientHelper(final BaseRecurrentLayer layer, final NeuralNetConfiguration conf,
                     final IActivation gateActivationFn, INDArray input, final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
                     final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
                     final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength,
@@ -433,7 +453,8 @@ public class LSTMHelpers {
                     final Map<String, INDArray> gradientViews, INDArray maskArray, //Input mask: should only be used with bidirectional RNNs + variable length
                     final boolean hasPeepholeConnections, //True for GravesLSTM, false for LSTM
                     final LSTMHelper helper,
-                    final LayerWorkspaceMgr workspaceMgr) {
+                    final LayerWorkspaceMgr workspaceMgr,
+                    final boolean isHelperAllowFallback) {
 
         input = input.castTo(inputWeights.dataType()); //No-op if
 
@@ -496,11 +517,29 @@ public class LSTMHelpers {
             rwGradientsGG = rwGradientsOut.get(all(), NDArrayIndex.point(4 * hiddenLayerSize + 2)).reshape(1, recurrentWeights.size(0));
         }
 
-        if (helper != null) {
-            Pair<Gradient, INDArray> ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
+        if (helper != null && (layer.helperCountFail == 0 || !isHelperAllowFallback)) {
+            Pair<Gradient, INDArray> ret = null;
+            try {
+                ret = helper.backpropGradient(conf, gateActivationFn, input, recurrentWeights,
                             inputWeights, epsilon, truncatedBPTT, tbpttBackwardLength, fwdPass, forwards,
                             inputWeightKey, recurrentWeightKey, biasWeightKey, gradientViews, maskArray,
                             hasPeepholeConnections, workspaceMgr);
+            }catch (ND4JOpProfilerException e){
+                throw e; //NaN panic etc for debugging
+            } catch (Exception e){
+                if(e.getMessage().contains("Failed to allocate")){
+                    //This is a memory exception - don't fallback to built-in implementation
                    throw e;
+                }
+
+                if(isHelperAllowFallback){
+                    layer.helperCountFail++;
+                    log.warn("MKL/CuDNN execution failed - falling back on built-in implementation",e);
+                } else {
+                    throw new RuntimeException("Error during LSTM MKL/CuDNN helper backprop - helperAllowFallback() is set to false", e);
+                }
+            }
+
             if (ret != null) {
                 return ret;
             }
@@ -45,7 +45,7 @@ Ideally, these should be excluded from any timing/performance results you report
 For example: what BLAS implementation (MKL, OpenBLAS, etc)? If you are using CUDA, are you using CuDNN?
 ND4J and DL4J can use these libraries (MKL, CuDNN) when they are available - but are not always available by default. If they are not made available, performance can be lower - sometimes considerably.
 
-This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MLK) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
+This is especially important when comparing results between libraries: for example, if you compared two libraries (one using OpenBLAS, another using MKL) your results may simply reflect the performance differences it the BLAS library being used - and not the performance oth the libraries being tested. Similarly, one library with CuDNN and another without CuDNN may simply reflect the performance benefit of using CuDNN.
 
 
 3. How are things configured?