More fixes (#148)
* Small batch norm fix (cuda/no-mkldnn)
  Signed-off-by: Alex Black <blacka101@gmail.com>

* Dropout fix for RnnOutputLayer
  Signed-off-by: Alex Black <blacka101@gmail.com>

* Allow block size < 2 in batch_to_space_nd and space_to_batch_nd for import, in spite of what the TF docs say
  Signed-off-by: AlexDBlack <blacka101@gmail.com>
parent 9c2bfc9863
commit e855e47f73
@@ -245,8 +245,8 @@ public class BatchNormalization extends BaseLayer<org.deeplearning4j.nn.conf.lay
         }
 
         //TODO: handle fixed beta/gamma case...
-        INDArray dBeta = epsilon.sum(0); //dL/dBeta = sum_examples dL/dOut
-        INDArray dGamma = epsilon.mul(xHat).sum(0); //dL/dGamma = sum_examples dL/dOut .* xHat
+        INDArray dBeta = epsilon.sum(true, 0); //dL/dBeta = sum_examples dL/dOut
+        INDArray dGamma = epsilon.mul(xHat).sum(true, 0); //dL/dGamma = sum_examples dL/dOut .* xHat
         INDArray dxhat;
         if (layerConf.isLockGammaBeta()) {
             dxhat = epsilon.mul(layerConf.getGamma());
@@ -257,11 +257,11 @@ public class BatchNormalization extends BaseLayer<org.deeplearning4j.nn.conf.lay
 
 
         //dL/dVariance
-        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true)); //Shape: [1, miniBatch]
+        INDArray dLdVar = dxhat.mul(xMu).sum(true, 0).muli(-0.5).muli(Transforms.pow(std, -3.0, true)); //Shape: [1, miniBatch]
 
         //dL/dmu
-        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
-        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
+        INDArray dxmu1 = dxhat.sum(true, 0).divi(std).negi();
+        INDArray dxmu2 = xMu.sum(true, 0).muli(-2.0 / batchSize).muli(dLdVar);
 
         INDArray dLdmu = dxmu1.addi(dxmu2); //Shape: [1, nOut]
 
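For context on the sum(0) -> sum(true, 0) change above: on ND4J versions from around this time, reducing a rank-2 array along dimension 0 without keepDims yields a rank-1 result, whereas the surrounding batch norm backprop code (see the "Shape: [1, nOut]" comments) works with rank-2 row vectors; passing keepDims = true preserves that shape. A minimal sketch of the difference; the [32, 10] sizes are illustrative only, not taken from the source:

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;
    import java.util.Arrays;

    public class KeepDimsSumDemo {
        public static void main(String[] args) {
            // Stand-in for epsilon with shape [minibatch, nOut] = [32, 10]
            INDArray epsilon = Nd4j.rand(32, 10);

            INDArray withoutKeepDims = epsilon.sum(0);       // reduces away dim 0 -> shape [10]
            INDArray withKeepDims    = epsilon.sum(true, 0); // keeps dim 0 as size 1 -> shape [1, 10]

            System.out.println(Arrays.toString(withoutKeepDims.shape())); // [10]
            System.out.println(Arrays.toString(withKeepDims.shape()));    // [1, 10]
        }
    }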
@@ -52,7 +52,6 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
     @Override
     public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
         assertInputSet(true);
-        applyDropOutIfNecessary(true, workspaceMgr); //Edge case: we skip OutputLayer forward pass during training as this isn't required to calculate gradients
         if (input.rank() != 3) {
             throw new UnsupportedOperationException(
                     "Input is not rank 3. RnnOutputLayer expects rank 3 input with shape [minibatch, layerInSize, sequenceLength]." +
@@ -65,6 +64,8 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
         INDArray inputTemp = input;
         this.input = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.BP_WORKING_MEM);
 
+        applyDropOutIfNecessary(true, workspaceMgr); //Edge case: we skip OutputLayer forward pass during training as this isn't required to calculate gradients
+
         Pair<Gradient, INDArray> gradAndEpsilonNext = super.backpropGradient(epsilon, workspaceMgr); //Also applies dropout
         this.input = inputTemp;
         INDArray epsilon2d = gradAndEpsilonNext.getSecond();
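The RnnOutputLayer fix is purely a reordering: dropout was previously applied to the raw rank-3 input before the rank check, and is now applied after the input has been reshaped to 2d, so the dropout mask is generated on the same 2d activations that super.backpropGradient(...) consumes. A simplified excerpt of the corrected ordering, reassembled from the two hunks above (not a standalone program):

    assertInputSet(true);
    // ... rank-3 input check omitted ...

    INDArray inputTemp = input;
    this.input = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.BP_WORKING_MEM);

    // Dropout is now applied to the reshaped 2d activations rather than to the
    // raw rank-3 input, immediately before the parent class uses that same 2d view
    applyDropOutIfNecessary(true, workspaceMgr);

    Pair<Gradient, INDArray> gradAndEpsilonNext = super.backpropGradient(epsilon, workspaceMgr);
    this.input = inputTemp;
    INDArray epsilon2d = gradAndEpsilonNext.getSecond();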
@@ -61,12 +61,6 @@ CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) {
     const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
     REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0));
 
-    // FIXME - should we use this time-consuming validation ?
-    for (uint i = 0; i < numOfSpatialDims; ++i) {
-        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
-        REQUIRE_TRUE(blockSize >= 2, 0, "BatchToSpaceND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
-    }
-
     if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) {
         const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
         REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects padding shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(crop).c_str());
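The removed loop rejected any blockShape element smaller than 2, following the wording of the TF documentation; in practice TensorFlow accepts a block size of 1, which simply leaves that spatial dimension untouched, so imported graphs can legitimately contain such values. A rough sketch of the shape arithmetic in Java (the concrete sizes are illustrative, not from the source):

    public class BatchToSpaceShapes {
        public static void main(String[] args) {
            // batch_to_space_nd: batch is divided by prod(blockShape); each spatial dim is
            // multiplied by its block size, then the crops are subtracted. A block size of 1
            // is a no-op for that dimension, which is why such graphs should still import.
            long[] inputShape = {4, 2, 3, 1};  // [batch, H, W, C]
            long[] blockShape = {1, 2};        // block size 1 for H, 2 for W

            long blockProd = blockShape[0] * blockShape[1];    // 2
            System.out.println(inputShape[0] / blockProd);      // out batch: 4 / 2 = 2
            System.out.println(inputShape[1] * blockShape[0]);  // out H: 2 * 1 = 2 (unchanged)
            System.out.println(inputShape[2] * blockShape[1]);  // out W: 3 * 2 = 6 (before crops)
        }
    }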
@@ -43,12 +43,6 @@ CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) {
 
     REQUIRE_TRUE(input->rankOf() == output->rankOf(), 0, "SpaceToBatchND: rank of input and output array must be the same, but got %i and %i correspondingly !", input->rankOf(), output->rankOf());
 
-    // FIXME - should we use this time-consuming validation ?
-    for (uint i = 0; i < numOfSpatialDims; ++i) {
-        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
-        REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatchND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
-    }
-
     if(padding->sizeAt(0) != numOfSpatialDims || padding->sizeAt(1) != 2) {
         const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
         REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(padding).c_str());
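The same relaxation is applied here for space_to_batch_nd. Since it is the inverse of batch_to_space_nd, the reasoning mirrors the example above: a block size of 1 leaves the corresponding (padded) spatial dimension unchanged, with the batch dimension multiplied rather than divided by prod(blockShape).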