From 9efd811508afbbcbb7357735b6b5ed10166cd084 Mon Sep 17 00:00:00 2001 From: Alex Black Date: Sat, 2 Nov 2019 17:42:01 +1100 Subject: [PATCH] Use DL4J workspaces for SameDiff layers in MLN/CG (#23) * #8329 DL4J workspace integration for SameDiff layers Signed-off-by: AlexDBlack * Fix bug for Nd4j.createUninitializedDetached for scalars (length 0 shape array) Signed-off-by: AlexDBlack * SameDiff output layer, graph vertex, various fixes Signed-off-by: AlexDBlack * Javadoc Signed-off-by: AlexDBlack --- .../org/deeplearning4j/TestBatchNormBp.java | 4 +- .../nn/layers/samediff/TestSameDiffDense.java | 342 +++++++++--------- .../layers/samediff/TestSameDiffLambda.java | 256 +++++++------ .../samediff/DL4JSameDiffMemoryMgr.java | 68 ++++ .../layers/samediff/SameDiffGraphVertex.java | 243 ++++++++----- .../nn/layers/samediff/SameDiffLayer.java | 192 +++++----- .../layers/samediff/SameDiffOutputLayer.java | 183 ++++++---- .../nd4j/linalg/api/ndarray/BaseNDArray.java | 12 +- .../linalg/workspace/BasicWorkspaceTests.java | 5 + 9 files changed, 760 insertions(+), 545 deletions(-) create mode 100644 deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/DL4JSameDiffMemoryMgr.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java index 54a47eead..f34ce65f0 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java @@ -96,8 +96,8 @@ public class TestBatchNormBp { bn.setInput(in, LayerWorkspaceMgr.noWorkspaces()); Pair p = net.backpropGradient(eps, LayerWorkspaceMgr.noWorkspaces()); - h.preOutput(in, true, new int[]{1,3}, gamma, beta, mean, var, 0.5, e, LayerWorkspaceMgr.noWorkspaces()); - Pair pmkl = h.backpropGradient(in, eps, new int[]{1,3}, gamma, beta, dLdg, dLdb, e, LayerWorkspaceMgr.noWorkspaces()); + h.preOutput(in, true, new long[]{1,3}, gamma, beta, mean, var, 0.5, e, LayerWorkspaceMgr.noWorkspaces()); + Pair pmkl = h.backpropGradient(in, eps, new long[]{1,3}, gamma, beta, dLdg, dLdb, e, LayerWorkspaceMgr.noWorkspaces()); INDArray dldin_dl4j = p.getSecond(); diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffDense.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffDense.java index df6757608..cdea20f70 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffDense.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffDense.java @@ -80,154 +80,159 @@ public class TestSameDiffDense extends BaseDL4JTest { @Test public void testSameDiffDenseForward() { - for (int minibatch : new int[]{5, 1}) { - int nIn = 3; - int nOut = 4; + for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) { + for (int minibatch : new int[]{5, 1}) { + int nIn = 3; + int nOut = 4; - Activation[] afns = new Activation[]{ - Activation.TANH, - Activation.SIGMOID, - Activation.ELU, - Activation.IDENTITY, - Activation.SOFTPLUS, - Activation.SOFTSIGN, - Activation.CUBE, - Activation.HARDTANH, - Activation.RELU - }; + Activation[] afns = new Activation[]{ + Activation.TANH, + Activation.SIGMOID, + Activation.ELU, + Activation.IDENTITY, + Activation.SOFTPLUS, + 
Activation.SOFTSIGN, + Activation.CUBE, + Activation.HARDTANH, + Activation.RELU + }; - for (Activation a : afns) { - log.info("Starting test - " + a); - MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() - .list() - .layer(new SameDiffDense.Builder().nIn(nIn).nOut(nOut) - .activation(a) - .build()) - .build(); + for (Activation a : afns) { + log.info("Starting test - " + a + ", workspace = " + wsm); + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .inferenceWorkspaceMode(wsm) + .trainingWorkspaceMode(wsm) + .list() + .layer(new SameDiffDense.Builder().nIn(nIn).nOut(nOut) + .activation(a) + .build()) + .build(); - MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.init(); + MultiLayerNetwork net = new MultiLayerNetwork(conf); + net.init(); - assertNotNull(net.paramTable()); + assertNotNull(net.paramTable()); - MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() - .list() - .layer(new DenseLayer.Builder().activation(a).nIn(nIn).nOut(nOut).build()) - .build(); + MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() + .list() + .layer(new DenseLayer.Builder().activation(a).nIn(nIn).nOut(nOut).build()) + .build(); - MultiLayerNetwork net2 = new MultiLayerNetwork(conf2); - net2.init(); + MultiLayerNetwork net2 = new MultiLayerNetwork(conf2); + net2.init(); - net.params().assign(net2.params()); + net.params().assign(net2.params()); - //Check params: - assertEquals(net2.params(), net.params()); - Map params1 = net.paramTable(); - Map params2 = net2.paramTable(); - assertEquals(params2, params1); + //Check params: + assertEquals(net2.params(), net.params()); + Map params1 = net.paramTable(); + Map params2 = net2.paramTable(); + assertEquals(params2, params1); - INDArray in = Nd4j.rand(minibatch, nIn); - INDArray out = net.output(in); - INDArray outExp = net2.output(in); + INDArray in = Nd4j.rand(minibatch, nIn); + INDArray out = net.output(in); + INDArray outExp = net2.output(in); - assertEquals(outExp, out); + assertEquals(outExp, out); - //Also check serialization: - MultiLayerNetwork netLoaded = TestUtils.testModelSerialization(net); - INDArray outLoaded = netLoaded.output(in); + //Also check serialization: + MultiLayerNetwork netLoaded = TestUtils.testModelSerialization(net); + INDArray outLoaded = netLoaded.output(in); - assertEquals(outExp, outLoaded); + assertEquals(outExp, outLoaded); - //Sanity check on different minibatch sizes: - INDArray newIn = Nd4j.vstack(in, in); - INDArray outMbsd = net.output(newIn); - INDArray outMb = net2.output(newIn); - assertEquals(outMb, outMbsd); + //Sanity check on different minibatch sizes: + INDArray newIn = Nd4j.vstack(in, in); + INDArray outMbsd = net.output(newIn); + INDArray outMb = net2.output(newIn); + assertEquals(outMb, outMbsd); + } } } } @Test public void testSameDiffDenseForwardMultiLayer() { - for (int minibatch : new int[]{5, 1}) { - int nIn = 3; - int nOut = 4; + for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) { + for (int minibatch : new int[]{5, 1}) { + int nIn = 3; + int nOut = 4; - Activation[] afns = new Activation[]{ - Activation.TANH, - Activation.SIGMOID, - Activation.ELU, - Activation.IDENTITY, - Activation.SOFTPLUS, - Activation.SOFTSIGN, - Activation.CUBE, //https://github.com/deeplearning4j/nd4j/issues/2426 - Activation.HARDTANH, - Activation.RELU //JVM crash - }; + Activation[] afns = new Activation[]{ + Activation.TANH, + Activation.SIGMOID, + Activation.ELU, + Activation.IDENTITY, + Activation.SOFTPLUS, + 
Activation.SOFTSIGN, + Activation.CUBE, //https://github.com/deeplearning4j/nd4j/issues/2426 + Activation.HARDTANH, + Activation.RELU //JVM crash + }; - for (Activation a : afns) { - log.info("Starting test - " + a); - MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() - .seed(12345) - .list() - .layer(new SameDiffDense.Builder().nIn(nIn).nOut(nOut) - .weightInit(WeightInit.XAVIER) - .activation(a).build()) - .layer(new SameDiffDense.Builder().nIn(nOut).nOut(nOut) - .weightInit(WeightInit.XAVIER) - .activation(a).build()) - .layer(new OutputLayer.Builder().nIn(nOut).nOut(nOut) - .weightInit(WeightInit.XAVIER) - .activation(a).build()) - .validateOutputLayerConfig(false) - .build(); + for (Activation a : afns) { + log.info("Starting test - " + a + " - workspace=" + wsm); + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .seed(12345) + .list() + .layer(new SameDiffDense.Builder().nIn(nIn).nOut(nOut) + .weightInit(WeightInit.XAVIER) + .activation(a).build()) + .layer(new SameDiffDense.Builder().nIn(nOut).nOut(nOut) + .weightInit(WeightInit.XAVIER) + .activation(a).build()) + .layer(new OutputLayer.Builder().nIn(nOut).nOut(nOut) + .weightInit(WeightInit.XAVIER) + .activation(a).build()) + .validateOutputLayerConfig(false) + .build(); - MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.init(); + MultiLayerNetwork net = new MultiLayerNetwork(conf); + net.init(); - assertNotNull(net.paramTable()); + assertNotNull(net.paramTable()); - MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() - .seed(12345) - .weightInit(WeightInit.XAVIER) - .list() - .layer(new DenseLayer.Builder().activation(a).nIn(nIn).nOut(nOut).build()) - .layer(new DenseLayer.Builder().activation(a).nIn(nOut).nOut(nOut).build()) - .layer(new OutputLayer.Builder().nIn(nOut).nOut(nOut) - .activation(a).build()) - .validateOutputLayerConfig(false) - .build(); + MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() + .seed(12345) + .weightInit(WeightInit.XAVIER) + .list() + .layer(new DenseLayer.Builder().activation(a).nIn(nIn).nOut(nOut).build()) + .layer(new DenseLayer.Builder().activation(a).nIn(nOut).nOut(nOut).build()) + .layer(new OutputLayer.Builder().nIn(nOut).nOut(nOut) + .activation(a).build()) + .validateOutputLayerConfig(false) + .build(); - MultiLayerNetwork net2 = new MultiLayerNetwork(conf2); - net2.init(); + MultiLayerNetwork net2 = new MultiLayerNetwork(conf2); + net2.init(); -// net.params().assign(net2.params()); - assertEquals(net2.params(), net.params()); + assertEquals(net2.params(), net.params()); - //Check params: - assertEquals(net2.params(), net.params()); - Map params1 = net.paramTable(); - Map params2 = net2.paramTable(); - assertEquals(params2, params1); + //Check params: + assertEquals(net2.params(), net.params()); + Map params1 = net.paramTable(); + Map params2 = net2.paramTable(); + assertEquals(params2, params1); - INDArray in = Nd4j.rand(minibatch, nIn); - INDArray out = net.output(in); - INDArray outExp = net2.output(in); + INDArray in = Nd4j.rand(minibatch, nIn); + INDArray out = net.output(in); + INDArray outExp = net2.output(in); - assertEquals(outExp, out); + assertEquals(outExp, out); - //Also check serialization: - MultiLayerNetwork netLoaded = TestUtils.testModelSerialization(net); - INDArray outLoaded = netLoaded.output(in); + //Also check serialization: + MultiLayerNetwork netLoaded = TestUtils.testModelSerialization(net); + INDArray outLoaded = netLoaded.output(in); - assertEquals(outExp, outLoaded); + 
assertEquals(outExp, outLoaded); - //Sanity check different minibatch sizes - in = Nd4j.rand(2 * minibatch, nIn); - out = net.output(in); - outExp = net2.output(in); - assertEquals(outExp, out); + //Sanity check different minibatch sizes + in = Nd4j.rand(2 * minibatch, nIn); + out = net.output(in); + outExp = net2.output(in); + assertEquals(outExp, out); + } } } } @@ -244,10 +249,13 @@ public class TestSameDiffDense extends BaseDL4JTest { Activation[] afns = new Activation[]{ Activation.TANH, Activation.SIGMOID, - Activation.ELU, Activation.IDENTITY, Activation.SOFTPLUS, Activation.SOFTSIGN, + Activation.ELU, + Activation.IDENTITY, + Activation.SOFTPLUS, + Activation.SOFTSIGN, Activation.HARDTANH, - Activation.CUBE, //https://github.com/deeplearning4j/nd4j/issues/2426 - Activation.RELU //JVM crash + Activation.CUBE, + Activation.RELU }; for (Activation a : afns) { @@ -337,64 +345,66 @@ public class TestSameDiffDense extends BaseDL4JTest { int nIn = 4; int nOut = 3; - boolean workspaces = true; - MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() - .seed(12345) - .trainingWorkspaceMode(workspaces ? WorkspaceMode.ENABLED : WorkspaceMode.NONE) - .inferenceWorkspaceMode(workspaces ? WorkspaceMode.ENABLED : WorkspaceMode.NONE) - .updater(new Adam(0.1)) - .list() - .layer(new SameDiffDense.Builder().nIn(nIn).nOut(5).activation(Activation.TANH).build()) - .layer(new SameDiffDense.Builder().nIn(5).nOut(5).activation(Activation.TANH).build()) - .layer(new OutputLayer.Builder().nIn(5).nOut(nOut).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build()) - .build(); + for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) { - MultiLayerNetwork netSD = new MultiLayerNetwork(conf); - netSD.init(); + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .seed(12345) + .trainingWorkspaceMode(wsm) + .inferenceWorkspaceMode(wsm) + .updater(new Adam(0.1)) + .list() + .layer(new SameDiffDense.Builder().nIn(nIn).nOut(5).activation(Activation.TANH).build()) + .layer(new SameDiffDense.Builder().nIn(5).nOut(5).activation(Activation.TANH).build()) + .layer(new OutputLayer.Builder().nIn(5).nOut(nOut).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build()) + .build(); - MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() - .seed(12345) - .updater(new Adam(0.1)) - .list() - .layer(new DenseLayer.Builder().activation(Activation.TANH).nIn(nIn).nOut(5).build()) - .layer(new DenseLayer.Builder().activation(Activation.TANH).nIn(5).nOut(5).build()) - .layer(new OutputLayer.Builder().nIn(5).nOut(nOut).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build()) - .build(); + MultiLayerNetwork netSD = new MultiLayerNetwork(conf); + netSD.init(); - MultiLayerNetwork netStandard = new MultiLayerNetwork(conf2); - netStandard.init(); + MultiLayerConfiguration conf2 = new NeuralNetConfiguration.Builder() + .seed(12345) + .updater(new Adam(0.1)) + .list() + .layer(new DenseLayer.Builder().activation(Activation.TANH).nIn(nIn).nOut(5).build()) + .layer(new DenseLayer.Builder().activation(Activation.TANH).nIn(5).nOut(5).build()) + .layer(new OutputLayer.Builder().nIn(5).nOut(nOut).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build()) + .build(); - netSD.params().assign(netStandard.params()); + MultiLayerNetwork netStandard = new MultiLayerNetwork(conf2); + netStandard.init(); - //Check params: - 
assertEquals(netStandard.params(), netSD.params()); - assertEquals(netStandard.paramTable(), netSD.paramTable()); + netSD.params().assign(netStandard.params()); - DataSetIterator iter = new IrisDataSetIterator(150,150); - DataSet ds = iter.next(); + //Check params: + assertEquals(netStandard.params(), netSD.params()); + assertEquals(netStandard.paramTable(), netSD.paramTable()); - INDArray outSD = netSD.output(ds.getFeatures()); - INDArray outStd = netStandard.output(ds.getFeatures()); + DataSetIterator iter = new IrisDataSetIterator(150, 150); + DataSet ds = iter.next(); - assertEquals(outStd, outSD); + INDArray outSD = netSD.output(ds.getFeatures()); + INDArray outStd = netStandard.output(ds.getFeatures()); - for( int i=0; i<3; i++ ){ - netSD.fit(ds); - netStandard.fit(ds); - String s = String.valueOf(i); - assertEquals(s, netStandard.getFlattenedGradients(), netSD.getFlattenedGradients()); - assertEquals(s, netStandard.params(), netSD.params()); - assertEquals(s, netStandard.getUpdater().getStateViewArray(), netSD.getUpdater().getStateViewArray()); + assertEquals(outStd, outSD); + + for (int i = 0; i < 3; i++) { + netSD.fit(ds); + netStandard.fit(ds); + String s = String.valueOf(i); + assertEquals(s, netStandard.getFlattenedGradients(), netSD.getFlattenedGradients()); + assertEquals(s, netStandard.params(), netSD.params()); + assertEquals(s, netStandard.getUpdater().getStateViewArray(), netSD.getUpdater().getStateViewArray()); + } + + //Sanity check on different minibatch sizes: + INDArray newIn = Nd4j.vstack(ds.getFeatures(), ds.getFeatures()); + INDArray outMbsd = netSD.output(newIn); + INDArray outMb = netStandard.output(newIn); + assertEquals(outMb, outMbsd); } - - //Sanity check on different minibatch sizes: - INDArray newIn = Nd4j.vstack(ds.getFeatures(), ds.getFeatures()); - INDArray outMbsd = netSD.output(newIn); - INDArray outMb = netStandard.output(newIn); - assertEquals(outMb, outMbsd); } @Test @@ -402,7 +412,7 @@ public class TestSameDiffDense extends BaseDL4JTest { int nIn = 4; int nOut = 4; - for (boolean workspaces : new boolean[]{false, true}) { + for (boolean workspaces : new boolean[]{true, false}) { for (Activation a : new Activation[]{Activation.TANH, Activation.IDENTITY}) { String msg = "workspaces: " + workspaces + ", " + a; diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffLambda.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffLambda.java index 6264aaf72..8368d3869 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffLambda.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/TestSameDiffLambda.java @@ -21,6 +21,7 @@ import org.deeplearning4j.BaseDL4JTest; import org.deeplearning4j.TestUtils; import org.deeplearning4j.nn.conf.ComputationGraphConfiguration; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; +import org.deeplearning4j.nn.conf.WorkspaceMode; import org.deeplearning4j.nn.conf.graph.ElementWiseVertex; import org.deeplearning4j.nn.conf.graph.ScaleVertex; import org.deeplearning4j.nn.conf.graph.ShiftVertex; @@ -52,152 +53,169 @@ public class TestSameDiffLambda extends BaseDL4JTest { @Test public void testSameDiffLamdaLayerBasic(){ - Nd4j.getRandom().setSeed(12345); - ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .seed(12345) - .updater(new Adam(0.01)) - .graphBuilder() - .addInputs("in") - 
.addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in") - .addLayer("1", new SameDiffSimpleLambdaLayer(), "0") - .addLayer("2", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "1") - .setOutputs("2") - .build(); + for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) { + log.info("--- Workspace Mode: {} ---", wsm); - //Equavalent, not using SameDiff Lambda: - ComputationGraphConfiguration confStd = new NeuralNetConfiguration.Builder() - .seed(12345) - .updater(new Adam(0.01)) - .graphBuilder() - .addInputs("in") - .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in") - .addVertex("1", new ShiftVertex(1.0), "0") - .addVertex("2", new ScaleVertex(2.0), "1") - .addLayer("3", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "2") - .setOutputs("3") - .build(); - ComputationGraph lambda = new ComputationGraph(conf); - lambda.init(); + Nd4j.getRandom().setSeed(12345); + ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() + .trainingWorkspaceMode(wsm) + .inferenceWorkspaceMode(wsm) + .seed(12345) + .updater(new Adam(0.01)) + .graphBuilder() + .addInputs("in") + .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in") + .addLayer("1", new SameDiffSimpleLambdaLayer(), "0") + .addLayer("2", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "1") + .setOutputs("2") + .build(); - ComputationGraph std = new ComputationGraph(confStd); - std.init(); + //Equavalent, not using SameDiff Lambda: + ComputationGraphConfiguration confStd = new NeuralNetConfiguration.Builder() + .trainingWorkspaceMode(wsm) + .inferenceWorkspaceMode(wsm) + .seed(12345) + .updater(new Adam(0.01)) + .graphBuilder() + .addInputs("in") + .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in") + .addVertex("1", new ShiftVertex(1.0), "0") + .addVertex("2", new ScaleVertex(2.0), "1") + .addLayer("3", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "2") + .setOutputs("3") + .build(); - lambda.setParams(std.params()); + ComputationGraph lambda = new ComputationGraph(conf); + lambda.init(); - INDArray in = Nd4j.rand(3,5); - INDArray labels = TestUtils.randomOneHot(3, 5); - DataSet ds = new DataSet(in, labels); + ComputationGraph std = new ComputationGraph(confStd); + std.init(); - INDArray outLambda = lambda.outputSingle(in); - INDArray outStd = std.outputSingle(in); + lambda.setParams(std.params()); - assertEquals(outLambda, outStd); + INDArray in = Nd4j.rand(3, 5); + INDArray labels = TestUtils.randomOneHot(3, 5); + DataSet ds = new DataSet(in, labels); - double scoreLambda = lambda.score(ds); - double scoreStd = std.score(ds); + INDArray outLambda = lambda.outputSingle(in); + INDArray outStd = std.outputSingle(in); - assertEquals(scoreStd, scoreLambda, 1e-6); + assertEquals(outLambda, outStd); - for( int i=0; i<3; i++ ){ - lambda.fit(ds); - std.fit(ds); + double scoreLambda = lambda.score(ds); + double scoreStd = std.score(ds); - String s = String.valueOf(i); - assertEquals(s, std.params(), lambda.params()); - assertEquals(s, std.getFlattenedGradients(), lambda.getFlattenedGradients()); + 
assertEquals(scoreStd, scoreLambda, 1e-6); + + for (int i = 0; i < 3; i++) { + lambda.fit(ds); + std.fit(ds); + + String s = String.valueOf(i); + assertEquals(s, std.params(), lambda.params()); + assertEquals(s, std.getFlattenedGradients(), lambda.getFlattenedGradients()); + } + + ComputationGraph loaded = TestUtils.testModelSerialization(lambda); + outLambda = loaded.outputSingle(in); + outStd = std.outputSingle(in); + + assertEquals(outStd, outLambda); + + //Sanity check on different minibatch sizes: + INDArray newIn = Nd4j.vstack(in, in); + INDArray outMbsd = lambda.output(newIn)[0]; + INDArray outMb = std.output(newIn)[0]; + assertEquals(outMb, outMbsd); } - - ComputationGraph loaded = TestUtils.testModelSerialization(lambda); - outLambda = loaded.outputSingle(in); - outStd = std.outputSingle(in); - - assertEquals(outStd, outLambda); - - //Sanity check on different minibatch sizes: - INDArray newIn = Nd4j.vstack(in, in); - INDArray outMbsd = lambda.output(newIn)[0]; - INDArray outMb = std.output(newIn)[0]; - assertEquals(outMb, outMbsd); } @Test public void testSameDiffLamdaVertexBasic(){ - Nd4j.getRandom().setSeed(12345); - ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .dataType(DataType.DOUBLE) - .seed(12345) - .updater(new Adam(0.01)) - .graphBuilder() - .addInputs("in1", "in2") - .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in1") - .addLayer("1", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in2") - .addVertex("lambda", new SameDiffSimpleLambdaVertex(), "0", "1") - .addLayer("2", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "lambda") - .setOutputs("2") - .build(); + for(WorkspaceMode wsm : new WorkspaceMode[]{WorkspaceMode.ENABLED, WorkspaceMode.NONE}) { + log.info("--- Workspace Mode: {} ---", wsm); - //Equavalent, not using SameDiff Lambda: - ComputationGraphConfiguration confStd = new NeuralNetConfiguration.Builder() - .dataType(DataType.DOUBLE) - .seed(12345) - .updater(new Adam(0.01)) - .graphBuilder() - .addInputs("in1", "in2") - .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in1") - .addLayer("1", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in2") - .addVertex("elementwise", new ElementWiseVertex(ElementWiseVertex.Op.Product), "0", "1") - .addLayer("3", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) - .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "elementwise") - .setOutputs("3") - .build(); + Nd4j.getRandom().setSeed(12345); + ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() + .trainingWorkspaceMode(wsm) + .inferenceWorkspaceMode(wsm) + .dataType(DataType.DOUBLE) + .seed(12345) + .updater(new Adam(0.01)) + .graphBuilder() + .addInputs("in1", "in2") + .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in1") + .addLayer("1", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in2") + .addVertex("lambda", new SameDiffSimpleLambdaVertex(), "0", "1") + .addLayer("2", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "lambda") + .setOutputs("2") + .build(); - ComputationGraph lambda = new ComputationGraph(conf); - lambda.init(); + //Equavalent, not using SameDiff Lambda: + 
ComputationGraphConfiguration confStd = new NeuralNetConfiguration.Builder() + .trainingWorkspaceMode(wsm) + .inferenceWorkspaceMode(wsm) + .dataType(DataType.DOUBLE) + .seed(12345) + .updater(new Adam(0.01)) + .graphBuilder() + .addInputs("in1", "in2") + .addLayer("0", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in1") + .addLayer("1", new DenseLayer.Builder().nIn(5).nOut(5).activation(Activation.TANH).build(), "in2") + .addVertex("elementwise", new ElementWiseVertex(ElementWiseVertex.Op.Product), "0", "1") + .addLayer("3", new OutputLayer.Builder().nIn(5).nOut(5).activation(Activation.SOFTMAX) + .lossFunction(LossFunctions.LossFunction.MCXENT).build(), "elementwise") + .setOutputs("3") + .build(); - ComputationGraph std = new ComputationGraph(confStd); - std.init(); + ComputationGraph lambda = new ComputationGraph(conf); + lambda.init(); - lambda.setParams(std.params()); + ComputationGraph std = new ComputationGraph(confStd); + std.init(); - INDArray in1 = Nd4j.rand(3,5); - INDArray in2 = Nd4j.rand(3,5); - INDArray labels = TestUtils.randomOneHot(3, 5); - MultiDataSet mds = new org.nd4j.linalg.dataset.MultiDataSet(new INDArray[]{in1, in2}, new INDArray[]{labels}); + lambda.setParams(std.params()); - INDArray outLambda = lambda.output(in1, in2)[0]; - INDArray outStd = std.output(in1, in2)[0]; + INDArray in1 = Nd4j.rand(3, 5); + INDArray in2 = Nd4j.rand(3, 5); + INDArray labels = TestUtils.randomOneHot(3, 5); + MultiDataSet mds = new org.nd4j.linalg.dataset.MultiDataSet(new INDArray[]{in1, in2}, new INDArray[]{labels}); - assertEquals(outLambda, outStd); + INDArray outLambda = lambda.output(in1, in2)[0]; + INDArray outStd = std.output(in1, in2)[0]; - double scoreLambda = lambda.score(mds); - double scoreStd = std.score(mds); + assertEquals(outLambda, outStd); - assertEquals(scoreStd, scoreLambda, 1e-6); + double scoreLambda = lambda.score(mds); + double scoreStd = std.score(mds); - for( int i=0; i<3; i++ ){ - lambda.fit(mds); - std.fit(mds); + assertEquals(scoreStd, scoreLambda, 1e-6); - String s = String.valueOf(i); - assertEquals(s, std.params(), lambda.params()); - assertEquals(s, std.getFlattenedGradients(), lambda.getFlattenedGradients()); + for (int i = 0; i < 3; i++) { + lambda.fit(mds); + std.fit(mds); + + String s = String.valueOf(i); + assertEquals(s, std.params(), lambda.params()); + assertEquals(s, std.getFlattenedGradients(), lambda.getFlattenedGradients()); + } + + ComputationGraph loaded = TestUtils.testModelSerialization(lambda); + outLambda = loaded.output(in1, in2)[0]; + outStd = std.output(in1, in2)[0]; + + assertEquals(outStd, outLambda); + + //Sanity check on different minibatch sizes: + INDArray newIn1 = Nd4j.vstack(in1, in1); + INDArray newIn2 = Nd4j.vstack(in2, in2); + INDArray outMbsd = lambda.output(newIn1, newIn2)[0]; + INDArray outMb = std.output(newIn1, newIn2)[0]; + assertEquals(outMb, outMbsd); } - - ComputationGraph loaded = TestUtils.testModelSerialization(lambda); - outLambda = loaded.output(in1, in2)[0]; - outStd = std.output(in1, in2)[0]; - - assertEquals(outStd, outLambda); - - //Sanity check on different minibatch sizes: - INDArray newIn1 = Nd4j.vstack(in1, in1); - INDArray newIn2 = Nd4j.vstack(in2, in2); - INDArray outMbsd = lambda.output(newIn1, newIn2)[0]; - INDArray outMb = std.output(newIn1, newIn2)[0]; - assertEquals(outMb, outMbsd); } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/DL4JSameDiffMemoryMgr.java 
b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/DL4JSameDiffMemoryMgr.java
new file mode 100644
index 000000000..c727f73c4
--- /dev/null
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/DL4JSameDiffMemoryMgr.java
@@ -0,0 +1,68 @@
+package org.deeplearning4j.nn.layers.samediff;
+
+import org.nd4j.autodiff.samediff.internal.memory.AbstractMemoryMgr;
+import org.nd4j.base.Preconditions;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.memory.MemoryWorkspace;
+import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.shape.LongShapeDescriptor;
+import org.nd4j.linalg.factory.Nd4j;
+
+/**
+ * A SameDiff {@link org.nd4j.autodiff.samediff.internal.SessionMemMgr} that uses DL4J workspaces for memory management.
+ * Any op outputs are allocated in the output workspace if they are returned to the layer; otherwise they are placed in
+ * the DL4J working memory workspace
+ *
+ * @author Alex Black
+ */
+public class DL4JSameDiffMemoryMgr extends AbstractMemoryMgr {
+
+    private final String workingMemoryWs;
+    private final String outputWs;
+    private final WorkspaceConfiguration confWorking;
+    private final WorkspaceConfiguration confOutput;
+
+    //Note: if the working memory or output workspace names are null -> detached memory
+    public DL4JSameDiffMemoryMgr(String workingMemoryWs, String outputWs, WorkspaceConfiguration confWorking,
+                                 WorkspaceConfiguration confOutput){
+        this.workingMemoryWs = workingMemoryWs;
+        this.outputWs = outputWs;
+        this.confWorking = confWorking;
+        this.confOutput = confOutput;
+    }
+
+
+    @Override
+    public INDArray allocate(boolean detached, DataType dataType, long... shape) {
+        String wsName = detached ? outputWs : workingMemoryWs;
+        WorkspaceConfiguration wsConf = detached ?
confOutput : confWorking; + + if(wsName == null){ + //Scoped out + INDArray ret = Nd4j.createUninitializedDetached(dataType, shape); + Preconditions.checkState(!ret.isAttached(), "Returned array should be detached"); + return ret; + } else { + MemoryWorkspace ws = Nd4j.getWorkspaceManager().getWorkspaceForCurrentThread(wsConf, wsName); + try (MemoryWorkspace mw = ws.notifyScopeBorrowed()) { + return Nd4j.createUninitialized(dataType, shape); + } + } + } + + @Override + public INDArray allocate(boolean detached, LongShapeDescriptor descriptor) { + return allocate(detached, descriptor.dataType(), descriptor.getShape()); + } + + @Override + public void release(INDArray array) { + //No-op - DL4J workspaces handles this + } + + @Override + public void close() { + //No-op - DL4J workspaces handles this + } +} diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffGraphVertex.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffGraphVertex.java index 3e1d1b831..1d2abe2b6 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffGraphVertex.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffGraphVertex.java @@ -31,9 +31,12 @@ import org.deeplearning4j.nn.workspace.ArrayType; import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.autodiff.samediff.internal.InferenceSession; +import org.nd4j.autodiff.samediff.internal.SessionMemMgr; import org.nd4j.base.Preconditions; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; +import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.impl.layers.ExternalErrorsFunction; import org.nd4j.linalg.factory.Nd4j; @@ -95,119 +98,159 @@ public class SameDiffGraphVertex extends BaseGraphVertex { @Override public INDArray doForward(boolean training, LayerWorkspaceMgr workspaceMgr) { try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { - if(sameDiff == null){ + if (sameDiff == null) { doInit(); } - - Map phMap = new HashMap<>(); - config.validateInput(inputs); - for(int i=0; i 0) { - //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration - //TODO Find a more efficient solution for this - for (Map.Entry e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - } - INDArray result = sameDiff.outputSingle(phMap, outputKey); - - //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere - sameDiff.clearPlaceholders(true); - sameDiff.clearOpInputs(); - return workspaceMgr.dup(ArrayType.ACTIVATIONS, result); } + + Map phMap = new HashMap<>(); + config.validateInput(inputs); + for(int i=0; i 0) { + //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration + //TODO Find a more efficient solution for this + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + } + INDArray result = sameDiff.outputSingle(phMap, outputKey); + + //Edge case: "vertex" is just an 
identity activation, for example + //TODO there may be a cleaner way to do this... + if(!actScopedOut && !result.data().getParentWorkspace().getId().equals(wsNameOutput)){ + result = workspaceMgr.dup(ArrayType.ACTIVATIONS, result); + } else if(actScopedOut && result.isAttached()){ + result = result.detach(); + } + + //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere + sameDiff.clearPlaceholders(true); + sameDiff.clearOpInputs(); + return workspaceMgr.dup(ArrayType.ACTIVATIONS, result); } @Override public Pair doBackward(boolean tbptt, LayerWorkspaceMgr workspaceMgr) { Gradient g = new DefaultGradient(); - INDArray[] dLdIns; - boolean[] noClose = new boolean[getNumInputArrays()]; - try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()){ - if(sameDiff == null){ + try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + if (sameDiff == null) { doInit(); } - - List inputNames = config.getVertexParams().getInputs(); - if(!sameDiff.hasGradientFunction()) { - //Create when scoped out, to ensure any arrays are not in WS - String[] inArr = inputNames.toArray(new String[inputNames.size()]); - sameDiff.createGradFunction(inArr); - } - config.validateInput(inputs); - Map phMap = new HashMap<>(); - List inputs = config.getVertexParams().getInputs(); - int i=0; - for(String s : inputs){ - phMap.put(s, this.inputs[i++]); - } - for( int j=0; j required = new ArrayList<>(inputNames.size()); //Ensure that the input placeholder gradients are calculated - for (Map.Entry e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - - required.addAll(paramTable.keySet()); - required.addAll(inputNames); - - Map gradsMap = sameDiff.calculateGradients(phMap, required); - for(String s : paramTable.keySet() ){ - INDArray sdGrad = gradsMap.get(s); - INDArray dl4jGrad = gradTable.get(s); - dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS - sdGrad.close(); //TODO optimize this - g.gradientForVariable().put(s, dl4jGrad); - } - - dLdIns = new INDArray[inputs.size()]; - String fnName = fn.getGradPlaceholderName(); - for(int j=0; j inputNames = config.getVertexParams().getInputs(); + if(!sameDiff.hasGradientFunction()) { + //Create when scoped out, to ensure any arrays are not in WS + String[] inArr = inputNames.toArray(new String[inputNames.size()]); + sameDiff.createGradFunction(inArr); + } + config.validateInput(inputs); + + //Configure memory management for SameDiff instance - use DL4J workspaces + Map sessionMap = sameDiff.getFunction("grad").getSessions(); + if(!sessionMap.containsKey(Thread.currentThread().getId())){ + sessionMap.put(Thread.currentThread().getId(), new InferenceSession(sameDiff.getFunction("grad"))); + } + String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.BP_WORKING_MEM); + String wsNameActGrad = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATION_GRAD); + WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.BP_WORKING_MEM); + WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATION_GRAD); + + boolean actGradScopedOut = workspaceMgr.isScopedOut(ArrayType.ACTIVATION_GRAD); + Preconditions.checkState(actGradScopedOut || wsNameActGrad != null, "Activation gradients must have a workspace or be scoped out"); + SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameActGrad, confWorking, confOutput); + sessionMap.get(Thread.currentThread().getId()).setMmgr(mmgr); + + + + Map 
phMap = new HashMap<>(); + List inputs = config.getVertexParams().getInputs(); + int i=0; + for(String s : inputs){ + phMap.put(s, this.inputs[i++]); + } + for( int j=0; j required = new ArrayList<>(inputNames.size()); //Ensure that the input placeholder gradients are calculated + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + + required.addAll(paramTable.keySet()); + required.addAll(inputNames); + + Map gradsMap = sameDiff.calculateGradients(phMap, required); + for(String s : paramTable.keySet() ){ + INDArray sdGrad = gradsMap.get(s); + INDArray dl4jGrad = gradTable.get(s); + dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS + g.gradientForVariable().put(s, dl4jGrad); + } + + INDArray[] dLdIns = new INDArray[inputs.size()]; + String fnName = fn.getGradPlaceholderName(); + for(int j=0; j { assertInputSet(false); try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { - if(sameDiff == null){ + if (sameDiff == null) { doInit(); } - - org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer bl = (org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer) layerConf(); - bl.validateInput(input); - - Map phMap = new HashMap<>(); - phMap.put(INPUT_KEY, input); - if(maskArray != null){ - phMap.put(MASK_KEY, maskArray); - } else { - phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); - } - - //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration - //TODO Find a more efficient solution for this - for (Map.Entry e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - - Map out = sameDiff.output(phMap, outputKey); - INDArray result = out.get(outputKey); - - //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere - sameDiff.clearPlaceholders(true); - sameDiff.clearOpInputs(); - - INDArray ret = workspaceMgr.dup(ArrayType.ACTIVATIONS, result); - if(!result.isAttached() && result.closeable()) { - //May be attached in rare edge case - for identity, or if gradients are passed through from output to input - // unchaned, as in identity, add scalar, etc - result.close(); - } - return ret; } + + org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer bl = (org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer) layerConf(); + bl.validateInput(input); + + Map phMap = new HashMap<>(); + phMap.put(INPUT_KEY, input); + if(maskArray != null){ + phMap.put(MASK_KEY, maskArray); + } else { + phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); + } + + //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration + //TODO Find a more efficient solution for this + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + + //Configure memory management for SameDiff instance - use DL4J workspaces + String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.FF_WORKING_MEM); + String wsNameOutput = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATIONS); + WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.FF_WORKING_MEM); + WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATIONS); + boolean actScopedOut = workspaceMgr.isScopedOut(ArrayType.ACTIVATIONS); + 
Preconditions.checkState(actScopedOut || wsNameOutput != null, "Activations must have a workspace or must be scoped out"); + SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameOutput, confWorking, confOutput); + + InferenceSession is = sameDiff.getSessions().get(Thread.currentThread().getId()); + if(is == null){ + is = new InferenceSession(sameDiff); + sameDiff.getSessions().put(Thread.currentThread().getId(), is); + } + is.setMmgr(mmgr); + + Map out = sameDiff.output(phMap, outputKey); + INDArray result = out.get(outputKey); + + //Edge case - identity activation + //TODO there may be a cleaner way to do this... + if(!actScopedOut && !result.data().getParentWorkspace().getId().equals(wsNameOutput)){ + result = workspaceMgr.dup(ArrayType.ACTIVATIONS, result); + } else if(actScopedOut && result.isAttached()){ + result = result.detach(); + } + + + //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere + sameDiff.clearPlaceholders(true); + sameDiff.clearOpInputs(); + + return result; } @@ -128,67 +150,71 @@ public class SameDiffLayer extends AbstractLayer { Gradient g = new DefaultGradient(); INDArray dLdIn; - boolean noCloseEps = false; - try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()){ - if(sameDiff == null){ + + try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + if (sameDiff == null) { doInit(); } - if(!sameDiff.hasGradientFunction()) { + if (!sameDiff.hasGradientFunction()) { //Create when scoped out, to ensure any arrays are not in WS sameDiff.createGradFunction(INPUT_KEY); } - - org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer bl = (org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer) layerConf(); - bl.validateInput(input); - - //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration - //TODO Find a more efficient solution for this - for (Map.Entry e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - - Map phMap = new HashMap<>(); - phMap.put(INPUT_KEY, input); - phMap.put(fn.getGradPlaceholderName(), epsilon); - if(maskArray != null){ - phMap.put(MASK_KEY, maskArray); - } else { - phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); - } - - List requiredGrads = new ArrayList<>(paramTable.size() + 1); - requiredGrads.add(INPUT_KEY); - requiredGrads.addAll(paramTable.keySet()); - - Map m = sameDiff.calculateGradients(phMap, requiredGrads); - for(String s : paramTable.keySet() ){ - INDArray sdGrad = m.get(s); - INDArray dl4jGrad = gradTable.get(s); - dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS - g.gradientForVariable().put(s, dl4jGrad); - sdGrad.close(); - } - - dLdIn = m.get(INPUT_KEY); - - if(dLdIn == null && fn.getGradPlaceholderName().equals(INPUT_KEY)){ - //Edge case with lambda layers like identity: SameDiff doesn't store the placeholders - // So, this getArr() can be trying to get placeholder from SameDiff instance, when it's available here - dLdIn = epsilon; - noCloseEps = true; - } } + //Configure memory management for SameDiff instance - use DL4J workspaces + Map sessionMap = sameDiff.getFunction("grad").getSessions(); + if(!sessionMap.containsKey(Thread.currentThread().getId())){ + sessionMap.put(Thread.currentThread().getId(), new InferenceSession(sameDiff.getFunction("grad"))); + } + String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.BP_WORKING_MEM); + String 
wsNameActGrad = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATION_GRAD); + WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.BP_WORKING_MEM); + WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATION_GRAD); + + boolean actGradScopedOut = workspaceMgr.isScopedOut(ArrayType.ACTIVATION_GRAD); + Preconditions.checkState(actGradScopedOut || wsNameActGrad != null, "Activation gradients must have a workspace or be scoped out"); + SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameActGrad, confWorking, confOutput); + sessionMap.get(Thread.currentThread().getId()).setMmgr(mmgr); + + + org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer bl = (org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer) layerConf(); + bl.validateInput(input); + + //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration + //TODO Find a more efficient solution for this + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + + Map phMap = new HashMap<>(); + phMap.put(INPUT_KEY, input); + phMap.put(fn.getGradPlaceholderName(), epsilon); + if(maskArray != null){ + phMap.put(MASK_KEY, maskArray); + } else { + phMap.put(MASK_KEY, layerConf().onesMaskForInput(input)); + } + + List requiredGrads = new ArrayList<>(paramTable.size() + 1); + requiredGrads.add(INPUT_KEY); + requiredGrads.addAll(paramTable.keySet()); + + Map m = sameDiff.calculateGradients(phMap, requiredGrads); + for(String s : paramTable.keySet() ){ + INDArray sdGrad = m.get(s); + INDArray dl4jGrad = gradTable.get(s); + dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS + g.gradientForVariable().put(s, dl4jGrad); + } + + dLdIn = m.get(INPUT_KEY); + //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere sameDiff.clearPlaceholders(true); sameDiff.clearOpInputs(); Pair ret = new Pair<>(g, workspaceMgr.dup(ArrayType.ACTIVATION_GRAD, dLdIn)); //TODO OPTIMIZE THIS - if(!noCloseEps && !dLdIn.isAttached() && dLdIn.closeable()) { - //Edge case: identity etc - might just pass gradient array through unchanged - dLdIn.close(); - } return ret; } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffOutputLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffOutputLayer.java index e6d9c2a7e..35c44d17d 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffOutputLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/samediff/SameDiffOutputLayer.java @@ -29,9 +29,12 @@ import org.deeplearning4j.nn.workspace.ArrayType; import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.autodiff.samediff.internal.InferenceSession; +import org.nd4j.autodiff.samediff.internal.SessionMemMgr; import org.nd4j.base.Preconditions; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; +import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.impl.layers.ExternalErrorsFunction; import org.nd4j.linalg.dataset.api.DataSet; @@ -95,40 +98,59 @@ public class SameDiffOutputLayer extends AbstractLayer 
e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - - Map phMap = new HashMap<>(); - phMap.put(INPUT_KEY, input); - if(!activations && layerConf().labelsRequired() && labels != null) { - phMap.put(LABELS_KEY, labels); - } - - String s = activations ? layerConf().activationsVertexName() : outputVar.name(); - - INDArray out = sameDiff.outputSingle(phMap, s); - - //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere - sameDiff.clearPlaceholders(true); - sameDiff.clearOpInputs(); - - if(activations) { - Preconditions.checkNotNull(out, "Activations (result) array for variable \"%s\" was " + - "null - error during execution or this variable (as defined by method activationsVertexName()) " + - "does not exist", layerConf().activationsVertexName()); - return workspaceMgr.dup(ArrayType.ACTIVATIONS, out); - } else { - return out; - } } + + //Configure memory management for SameDiff instance - use DL4J workspaces + String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.FF_WORKING_MEM); + String wsNameOutput = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATIONS); + WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.FF_WORKING_MEM); + WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATIONS); + boolean actScopedOut = workspaceMgr.isScopedOut(ArrayType.ACTIVATIONS); + Preconditions.checkState(actScopedOut || wsNameOutput != null, "Activations must have a workspace or must be scoped out"); + SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameOutput, confWorking, confOutput); + + InferenceSession is = sameDiff.getSessions().get(Thread.currentThread().getId()); + if(is == null){ + is = new InferenceSession(sameDiff); + sameDiff.getSessions().put(Thread.currentThread().getId(), is); + } + is.setMmgr(mmgr); + + + + //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration + //TODO Find a more efficient solution for this + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + + Map phMap = new HashMap<>(); + phMap.put(INPUT_KEY, input); + if(!activations && layerConf().labelsRequired() && labels != null) { + phMap.put(LABELS_KEY, labels); + } + + String s = activations ? layerConf().activationsVertexName() : outputVar.name(); + + INDArray out = sameDiff.outputSingle(phMap, s); + + //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere + sameDiff.clearPlaceholders(true); + sameDiff.clearOpInputs(); + + //Edge case: vertex is just an Identity function, for example + //TODO there may be a cleaner way to do this... 
+ if(!actScopedOut && !out.data().getParentWorkspace().getId().equals(wsNameOutput)){ + out = workspaceMgr.dup(ArrayType.ACTIVATIONS, out); + } else if(actScopedOut && out.isAttached()){ + out = out.detach(); + } + + return out; } @@ -141,54 +163,76 @@ public class SameDiffOutputLayer extends AbstractLayer e : paramTable.entrySet()) { - INDArray arr = e.getValue(); - sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); - } - - List gradVarNames = new ArrayList<>(); - gradVarNames.addAll(paramTable.keySet()); - gradVarNames.add(INPUT_KEY); - - Map phMap = new HashMap<>(); - phMap.put(INPUT_KEY, input); - phMap.put(LABELS_KEY, labels); - - Map grads = sameDiff.calculateGradients(phMap, gradVarNames); - for(String s : paramTable.keySet() ){ - INDArray sdGrad = grads.get(s); - INDArray dl4jGrad = gradTable.get(s); - dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS - g.gradientForVariable().put(s, dl4jGrad); - if(sdGrad.closeable()){ - sdGrad.close(); - } - } - - dLdIn = grads.get(INPUT_KEY); } + //Configure memory management for SameDiff instance - use DL4J workspaces + Map sessionMap = sameDiff.getFunction("grad").getSessions(); + if(!sessionMap.containsKey(Thread.currentThread().getId())){ + sessionMap.put(Thread.currentThread().getId(), new InferenceSession(sameDiff.getFunction("grad"))); + } + String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.BP_WORKING_MEM); + String wsNameActGrad = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATION_GRAD); + WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.BP_WORKING_MEM); + WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATION_GRAD); + + boolean actGradScopedOut = workspaceMgr.isScopedOut(ArrayType.ACTIVATION_GRAD); + Preconditions.checkState(actGradScopedOut || wsNameActGrad != null, "Activation gradients must have a workspace or be scoped out"); + SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameActGrad, confWorking, confOutput); + sessionMap.get(Thread.currentThread().getId()).setMmgr(mmgr); + + if(!sameDiff.hasGradientFunction()) { + //Create when scoped out, to ensure any arrays are not in WS + sameDiff.createGradFunction(INPUT_KEY); + } + + //Because DL4J parameters are views, and SameDiff uses DeviceLocal (which doesn't support views), we need to update the arrays on each iteration + //TODO Find a more efficient solution for this + for (Map.Entry e : paramTable.entrySet()) { + INDArray arr = e.getValue(); + sameDiff.assignArray(arr, sameDiff.getVariable(e.getKey())); + } + + List gradVarNames = new ArrayList<>(); + gradVarNames.addAll(paramTable.keySet()); + gradVarNames.add(INPUT_KEY); + + Map phMap = new HashMap<>(); + phMap.put(INPUT_KEY, input); + phMap.put(LABELS_KEY, labels); + + Map grads = sameDiff.calculateGradients(phMap, gradVarNames); + for(String s : paramTable.keySet() ){ + INDArray sdGrad = grads.get(s); + INDArray dl4jGrad = gradTable.get(s); + dl4jGrad.assign(sdGrad); //TODO OPTIMIZE THIS + g.gradientForVariable().put(s, dl4jGrad); + if(sdGrad.closeable()){ + sdGrad.close(); + } + } + + dLdIn = grads.get(INPUT_KEY); + //Clear placeholders and op inputs to ensure no out-of-scope arrays are still referenced anywhere sameDiff.clearPlaceholders(true); sameDiff.clearOpInputs(); - Pair p = new Pair<>(g, workspaceMgr.dup(ArrayType.ACTIVATION_GRAD, dLdIn)); //TODO OPTIMIZE THIS - if(dLdIn.closeable()) - dLdIn.close(); - return p; + //TODO there may be a cleaner way to do this... 
+        if(!actGradScopedOut && !dLdIn.data().getParentWorkspace().getId().equals(wsNameActGrad)){
+            dLdIn = workspaceMgr.dup(ArrayType.ACTIVATION_GRAD, dLdIn);
+        } else if(actGradScopedOut && dLdIn.isAttached()){
+            dLdIn = dLdIn.detach();
+        }
+
+        return new Pair<>(g, dLdIn);
     }
 
     /**Returns the parameters of the neural network as a flattened row vector
      * @return the parameters of the neural network as a flattened row vector
      */
@@ -312,7 +356,8 @@ public class SameDiffOutputLayer extends AbstractLayer
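Note on the pattern in this patch: SameDiffLayer, SameDiffOutputLayer and SameDiffGraphVertex all repeat the same wiring so that SameDiff allocates op outputs through DL4J workspaces. The sketch below collects that forward-pass wiring in one place, using only the calls that appear in the diffs above; the helper class and method names are illustrative and are not part of this patch.

package org.deeplearning4j.nn.layers.samediff;

import org.deeplearning4j.nn.workspace.ArrayType;
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
import org.nd4j.autodiff.samediff.SameDiff;
import org.nd4j.autodiff.samediff.internal.InferenceSession;
import org.nd4j.autodiff.samediff.internal.SessionMemMgr;
import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration;

// Illustrative helper (hypothetical name, not in this patch): configure the current
// thread's SameDiff InferenceSession to allocate via DL4J workspaces, as the
// forward-pass changes above do inside each layer/vertex.
class SameDiffWorkspaceWiringSketch {

    static void useDl4jWorkspaces(SameDiff sameDiff, LayerWorkspaceMgr workspaceMgr) {
        // Working-memory vs. output (activations) workspaces; a null workspace name makes
        // DL4JSameDiffMemoryMgr fall back to detached allocation for that category.
        String wsNameWorking = workspaceMgr.getWorkspaceName(ArrayType.FF_WORKING_MEM);
        String wsNameOutput = workspaceMgr.getWorkspaceName(ArrayType.ACTIVATIONS);
        WorkspaceConfiguration confWorking = workspaceMgr.getConfiguration(ArrayType.FF_WORKING_MEM);
        WorkspaceConfiguration confOutput = workspaceMgr.getConfiguration(ArrayType.ACTIVATIONS);

        SessionMemMgr mmgr = new DL4JSameDiffMemoryMgr(wsNameWorking, wsNameOutput, confWorking, confOutput);

        // SameDiff keeps one InferenceSession per thread; create it if absent and point it
        // at the DL4J-backed memory manager.
        InferenceSession is = sameDiff.getSessions().get(Thread.currentThread().getId());
        if (is == null) {
            is = new InferenceSession(sameDiff);
            sameDiff.getSessions().put(Thread.currentThread().getId(), is);
        }
        is.setMmgr(mmgr);
    }
}

For the backward pass the patch applies the same wiring to sameDiff.getFunction("grad").getSessions(), with BP_WORKING_MEM and ACTIVATION_GRAD in place of FF_WORKING_MEM and ACTIVATIONS.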