More fixes (#148)

* Small batch norm fix (cuda/no-mkldnn)

Signed-off-by: Alex Black <blacka101@gmail.com>

* Dropout fix for RnnOutputLayer

Signed-off-by: Alex Black <blacka101@gmail.com>

* Allow block size < 2 in batch_to_space_nd and space_to_batch_nd for import, in spite of what TF docs say

Signed-off-by: AlexDBlack <blacka101@gmail.com>
master
Alex Black 2019-08-22 19:55:27 +10:00 committed by GitHub
parent 9c2bfc9863
commit e855e47f73
4 changed files with 7 additions and 18 deletions


@@ -245,8 +245,8 @@ public class BatchNormalization extends BaseLayer<org.deeplearning4j.nn.conf.lay
}
//TODO: handle fixed beta/gamma case...
- INDArray dBeta = epsilon.sum(0); //dL/dBeta = sum_examples dL/dOut
- INDArray dGamma = epsilon.mul(xHat).sum(0); //dL/dGamma = sum_examples dL/dOut .* xHat
+ INDArray dBeta = epsilon.sum(true, 0); //dL/dBeta = sum_examples dL/dOut
+ INDArray dGamma = epsilon.mul(xHat).sum(true, 0); //dL/dGamma = sum_examples dL/dOut .* xHat
INDArray dxhat;
if (layerConf.isLockGammaBeta()) {
dxhat = epsilon.mul(layerConf.getGamma());
@@ -257,11 +257,11 @@ public class BatchNormalization extends BaseLayer<org.deeplearning4j.nn.conf.lay
//dL/dVariance
- INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true)); //Shape: [1, miniBatch]
+ INDArray dLdVar = dxhat.mul(xMu).sum(true, 0).muli(-0.5).muli(Transforms.pow(std, -3.0, true)); //Shape: [1, miniBatch]
//dL/dmu
- INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
- INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
+ INDArray dxmu1 = dxhat.sum(true, 0).divi(std).negi();
+ INDArray dxmu2 = xMu.sum(true, 0).muli(-2.0 / batchSize).muli(dLdVar);
INDArray dLdmu = dxmu1.addi(dxmu2); //Shape: [1, nOut]
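
The batch norm change above swaps the plain reductions for their keep-dims form, so the reduced gradients keep their leading dimension ([1, nOut] / [1, miniBatch], as the shape comments note) instead of collapsing to rank-1 vectors. A minimal sketch of the shape difference, assuming ND4J on the classpath; the [8, 4] epsilon is made up:

    import java.util.Arrays;
    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    public class KeepDimsShapes {
        public static void main(String[] args) {
            INDArray epsilon = Nd4j.rand(8, 4);      // hypothetical [minibatch, nOut] gradient

            INDArray plain = epsilon.sum(0);         // drops the reduced dim on recent ND4J: shape [4]
            INDArray kept  = epsilon.sum(true, 0);   // keepDims form used by the fix: shape [1, 4]

            System.out.println(Arrays.toString(plain.shape())); // [4]
            System.out.println(Arrays.toString(kept.shape()));  // [1, 4]
        }
    }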


@@ -52,7 +52,6 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
assertInputSet(true);
- applyDropOutIfNecessary(true, workspaceMgr); //Edge case: we skip OutputLayer forward pass during training as this isn't required to calculate gradients
if (input.rank() != 3) {
throw new UnsupportedOperationException(
"Input is not rank 3. RnnOutputLayer expects rank 3 input with shape [minibatch, layerInSize, sequenceLength]." +
@@ -65,6 +64,8 @@ public class RnnOutputLayer extends BaseOutputLayer<org.deeplearning4j.nn.conf.l
INDArray inputTemp = input;
this.input = TimeSeriesUtils.reshape3dTo2d(input, workspaceMgr, ArrayType.BP_WORKING_MEM);
+ applyDropOutIfNecessary(true, workspaceMgr); //Edge case: we skip OutputLayer forward pass during training as this isn't required to calculate gradients
Pair<Gradient, INDArray> gradAndEpsilonNext = super.backpropGradient(epsilon, workspaceMgr); //Also applies dropout
this.input = inputTemp;
INDArray epsilon2d = gradAndEpsilonNext.getSecond();
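
The RnnOutputLayer fix moves applyDropOutIfNecessary below the 3d-to-2d reshape, so the dropout mask is generated against the same 2d activations that super.backpropGradient sees. A rough sketch of the two shapes involved, with made-up sizes; the permute/reshape only approximates what TimeSeriesUtils.reshape3dTo2d does:

    import java.util.Arrays;
    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    public class RnnReshapeSketch {
        public static void main(String[] args) {
            // Rank-3 RNN activations: [minibatch, layerSize, timeSeriesLength]
            INDArray act3d = Nd4j.rand(new int[]{2, 3, 5});

            // Roughly what TimeSeriesUtils.reshape3dTo2d produces: [minibatch * tsLength, layerSize]
            INDArray act2d = act3d.permute(0, 2, 1).dup('c').reshape(2 * 5, 3);

            System.out.println(Arrays.toString(act3d.shape())); // [2, 3, 5]
            System.out.println(Arrays.toString(act2d.shape())); // [10, 3]
            // A dropout mask built against the rank-3 input cannot be applied element-wise
            // to the [10, 3] reshaped activations, hence dropout is applied after the reshape.
        }
    }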


@@ -61,12 +61,6 @@ CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) {
const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0));
- // FIXME - should we use this time-consuming validation ?
- for (uint i = 0; i < numOfSpatialDims; ++i) {
-     const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
-     REQUIRE_TRUE(blockSize >= 2, 0, "BatchToSpaceND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
- }
if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) {
const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects padding shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(crop).c_str());
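
This hunk (and the matching one in space_to_batch_nd below) drops the blockShape >= 2 check: TF graphs can legitimately use a block size of 1, which is a no-op along that dimension, and rejecting it broke import. The divisibility check on the batch dimension is kept. A small sketch of the surviving shape arithmetic, with a hypothetical block size of 1 on one spatial dim:

    public class BlockShapeSketch {
        public static void main(String[] args) {
            // Hypothetical imported case: NHWC input, block size 1 on the first spatial dim
            long[] inputShape = {4, 3, 3, 1};   // [batch, H, W, C]
            long[] blockShape = {1, 2};         // 1 is now accepted: a no-op along that dimension

            long product = 1;
            for (long b : blockShape) {
                product *= b;                   // prod(blockShape) = 2
            }

            // The validation the op still performs: batch must be divisible by prod(blockShape)
            System.out.println(inputShape[0] % product == 0);                 // true (4 % 2 == 0)
            System.out.println("output batch = " + inputShape[0] / product);  // 2
        }
    }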


@@ -43,12 +43,6 @@ CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) {
REQUIRE_TRUE(input->rankOf() == output->rankOf(), 0, "SpaceToBatchND: rank of input and output array must be the same, but got %i and %i correspondingly !", input->rankOf(), output->rankOf());
- // FIXME - should we use this time-consuming validation ?
- for (uint i = 0; i < numOfSpatialDims; ++i) {
-     const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
-     REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatchND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
- }
if(padding->sizeAt(0) != numOfSpatialDims || padding->sizeAt(1) != 2) {
const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(padding).c_str());