diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/convolution/SubsamplingLayerTest.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/convolution/SubsamplingLayerTest.java index 69f8c22db..e0e556f39 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/convolution/SubsamplingLayerTest.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/convolution/SubsamplingLayerTest.java @@ -155,7 +155,7 @@ public class SubsamplingLayerTest extends BaseDL4JTest { } - @Test(expected = IllegalStateException.class) + @Test(expected = UnsupportedOperationException.class) public void testSubSampleLayerSumBackprop() throws Exception { Layer layer = getSubsamplingLayer(SubsamplingLayer.PoolingType.SUM); INDArray input = getData(); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SpaceToDepthLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SpaceToDepthLayer.java index aeca265f8..44f8bb666 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SpaceToDepthLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SpaceToDepthLayer.java @@ -92,7 +92,6 @@ public class SpaceToDepthLayer extends NoParamLayer { @Override public LayerMemoryReport getMemoryReport(InputType inputType) { - InputType.InputTypeConvolutional c = (InputType.InputTypeConvolutional) inputType; InputType.InputTypeConvolutional outputType = (InputType.InputTypeConvolutional) getOutputType(-1, inputType); return new LayerMemoryReport.Builder(layerName, SpaceToDepthLayer.class, inputType, outputType) diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java index 9eff7a91a..b2e4df6b8 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/SubsamplingLayer.java @@ -57,6 +57,12 @@ public class SubsamplingLayer extends NoParamLayer { protected int pnorm; protected double eps; protected boolean cudnnAllowFallback = true; + /* + Default here for JSON deserialization of 1.0.0-beta4 and earlier models. New models default to false via builder. + This impacts average pooling only - whether the divisor should include or exclude padding along image edges. + DL4J originally included padding in the count, versions after 1.0.0-beta4 will exclude it by default. 
+ */ + protected boolean avgPoolIncludePadInDivisor = true; public enum PoolingType { MAX, AVG, SUM, PNORM; @@ -95,6 +101,7 @@ public class SubsamplingLayer extends NoParamLayer { this.pnorm = builder.pnorm; this.eps = builder.eps; this.cudnnAllowFallback = builder.cudnnAllowFallback; + this.avgPoolIncludePadInDivisor = builder.avgPoolIncludePadInDivisor; } @Override @@ -376,6 +383,7 @@ public class SubsamplingLayer extends NoParamLayer { * Whether fallback to non-CuDNN implementation should be used */ protected boolean cudnnAllowFallback = true; + protected boolean avgPoolIncludePadInDivisor = false; protected BaseSubsamplingBuilder(PoolingType poolingType, int[] kernelSize, int[] stride) { this.setPoolingType(poolingType.toPoolingType()); @@ -482,6 +490,29 @@ public class SubsamplingLayer extends NoParamLayer { this.cudnnAllowFallback = allowFallback; return (T) this; } + + /** + * When doing average pooling, should the padding values be included in the divisor or not?
+ * Not applicable for max and p-norm pooling.
+     * Users should not usually set this - instead, leave it as the default (false). It is included mainly for backward
+     * compatibility of older models
+ * Consider the following 2x2 segment along the right side of the image:
+ *
+         * [A, P]
+         * [B, P]
+         * 
+ * Where A and B are actual values, and P is padding (0).
+ * With avgPoolIncludePadInDivisor = true, we have: out = (A+B+0+0)/4
+     * With avgPoolIncludePadInDivisor = false, we have: out = (A+B+0+0)/2 = (A+B)/2
+ *
+     * Earlier versions of DL4J included padding in the count; versions after 1.0.0-beta4 exclude it by default.
+ * + * @param avgPoolIncludePadInDivisor Whether the divisor should include or exclude padding for average pooling + */ + public T avgPoolIncludePadInDivisor(boolean avgPoolIncludePadInDivisor){ + this.avgPoolIncludePadInDivisor = avgPoolIncludePadInDivisor; + return (T) this; + } } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/AbstractSameDiffLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/AbstractSameDiffLayer.java index 74d5f450e..6cf4ae810 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/AbstractSameDiffLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/AbstractSameDiffLayer.java @@ -35,6 +35,7 @@ import org.deeplearning4j.optimize.api.TrainingListener; import org.deeplearning4j.util.NetworkUtils; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.learning.config.IUpdater; import org.nd4j.linalg.learning.regularization.L1Regularization; import org.nd4j.linalg.learning.regularization.L2Regularization; @@ -205,6 +206,22 @@ public abstract class AbstractSameDiffLayer extends Layer { applyGlobalConfigToLayer(b); } + /** + * This method generates an "all ones" mask array for use in the SameDiff model when none is provided. + * @param input Input to the layer + * @return A mask array - should be same datatype as the input (usually) + */ + public INDArray onesMaskForInput(INDArray input){ + if(input.rank() == 2){ + return Nd4j.ones(input.dataType(), input.size(0), 1); + } else if(input.rank() == 3){ + return Nd4j.ones(input.dataType(), input.size(0), input.size(2)); //mask: [mb, length] vs. input [mb, nIn, length] + } else { + throw new IllegalStateException("When using masking with rank 4+ inputs, the onesMaskForInput method must be implemented, " + + "in order to determine the correct mask shape for this layer"); + } + } + @Getter @Setter public static abstract class Builder> extends Layer.Builder { diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/SpaceToDepth.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/SpaceToDepth.java index 5516738fc..b726ea87c 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/SpaceToDepth.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/SpaceToDepth.java @@ -109,7 +109,7 @@ public class SpaceToDepth extends AbstractLayer ret = null; @@ -173,116 +160,42 @@ public class SubsamplingLayer extends AbstractLayer(retGradient, outEpsilon); + return new Pair<>(retGradient, epsAtInput); } private static double minValue(){ @@ -326,7 +239,8 @@ public class SubsamplingLayer extends AbstractLayer { phMap.put(INPUT_KEY, input); if(maskArray != null){ phMap.put(MASK_KEY, maskArray); + } else { + phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); } for(String s : paramTable.keySet() ) { @@ -139,6 +141,8 @@ public class SameDiffLayer extends AbstractLayer { phMap.put(fn.getGradPlaceholderName(), epsilon); if(maskArray != null){ phMap.put(MASK_KEY, maskArray); + } else { + phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); } List requiredGrads = new ArrayList<>(paramTable.size() + 1);
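
To make the new average-pooling option concrete, here is a minimal configuration sketch (illustrative only, not part of the patch; the kernel, stride and padding values are arbitrary). For the 2x2 edge window [A, P; B, P] described in the Javadoc above, setting the flag to true reproduces the legacy pre-1.0.0-beta4 behaviour and divides by 4, while the new builder default of false divides by 2:

import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;

public class AvgPoolDivisorExample {
    public static void main(String[] args) {
        // Sketch only: restores the legacy (pre-1.0.0-beta4) divisor behaviour, e.g. when
        // reproducing an older model. New models should keep the builder default (false).
        SubsamplingLayer legacyAvgPool = new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.AVG)
                .kernelSize(2, 2)
                .stride(2, 2)
                .padding(1, 1)                      // padding creates the P cells along the image border
                .avgPoolIncludePadInDivisor(true)   // divisor counts padded cells: out = (A+B+0+0)/4
                .build();

        // With avgPoolIncludePadInDivisor(false) - the new default - the same edge window gives (A+B)/2.
        System.out.println(legacyAvgPool);
    }
}

Models serialized with 1.0.0-beta4 or earlier deserialize with the field defaulting to true (see the field-level comment in the diff), so their numerics are unchanged on upgrade.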
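Similarly, the default onesMaskForInput added to AbstractSameDiffLayer only handles rank-2 and rank-3 activations and throws for rank 4 and above, so a custom SameDiff layer that supports masking on higher-rank input has to override it. The fragment below is a sketch of such an override inside a hypothetical custom layer; it assumes a per-example mask of shape [mb, 1] for rank-4 [mb, channels, height, width] activations, and the correct shape ultimately depends on how the layer consumes the mask placeholder.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

// Inside a custom layer class extending AbstractSameDiffLayer (sketch only):
@Override
public INDArray onesMaskForInput(INDArray input) {
    if (input.rank() != 4) {
        throw new IllegalStateException("Expected rank 4 input [mb, c, h, w], got rank " + input.rank());
    }
    // All-ones per-example mask of shape [mb, 1]: nothing is masked out when no mask array is supplied
    return Nd4j.ones(input.dataType(), input.size(0), 1);
}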