MLN/CG: Don't swallow exceptions if a second exception occurs during workspace closing (#161)

Signed-off-by: AlexDBlack <blacka101@gmail.com>
2019-08-24 17:33:11 +10:00 · 2019-08-24 17:33:11 +10:00 · b85238a6df
commit b85238a6df
parent f8364997c0
2 changed files with 121 additions and 63 deletions
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java
@ -2278,6 +2278,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
        LayerWorkspaceMgr allNone = noWS ? LayerWorkspaceMgr.noWorkspaces(helperWorkspaces) : null;
        List<MemoryWorkspace>[] closeAtEndIteraton = (List<MemoryWorkspace>[])new List[topologicalOrder.length];
        MemoryWorkspace initialWorkspace = Nd4j.getMemoryManager().getCurrentWorkspace();
+        Throwable t = null;
        try {
            for (int i = 0; i <= stopIndex; i++) {
                GraphVertex current = vertices[topologicalOrder[i]];
@ -2436,6 +2437,8 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
                    }
                }
            }
+        } catch (Throwable t2){
+            t = t2;
        } finally {
            //Close all open workspaces... usually this list will be empty, but not if an exception is thrown
            //Though if stopIndex < numLayers, some might still be open
@ -2444,7 +2447,15 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
                    //Edge case here: seems that scoping out can increase the tagScope of the current WS
                    //and if we hit an exception during forward pass, we aren't guaranteed to call close a sufficient
                    // number of times to actually close it, in all cases
+                    try{
                        ws.close();
+                    } catch (Throwable t2){
+                        if(t != null){
+                            log.error("Encountered second exception while trying to close workspace after initial exception");
+                            log.error("Original exception:", t);
+                            throw t2;
+                        }
+                    }
                }
            }
            Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace);
@ -2581,6 +2592,7 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {

        boolean traceLog = log.isTraceEnabled();

+        Throwable t = null;
        try {
            for (int i = topologicalOrder.length - 1; i >= 0; i--) {
                boolean hitFrozen = false;
@ -2732,8 +2744,8 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
                        tempList.addFirst(new Triple<>(newName, entry.getValue(),
                                g.flatteningOrderForVariable(origName)));
                    }
-                    for (Triple<String, INDArray, Character> t : tempList)
-                        gradients.addFirst(t);
+                    for (Triple<String, INDArray, Character> triple : tempList)
+                        gradients.addFirst(triple);
                }

                //Close any activation gradient workspaces that we no longer require
@ -2752,19 +2764,28 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
                    log.trace("Completed backprop: {} (\"{}\") - {}", i, vertexName, current.getClass().getSimpleName());
                }
            }
-
+        } catch (Throwable t2){
+            t = t2;
        } finally {
            //Close all open workspaces... usually this list will be empty, but not if an exception is thrown
            for(MemoryWorkspace ws : openActivationsWorkspaces.keySet()){
+                try{
                    ws.close();
+                } catch (Throwable t2){
+                    if(t != null){
+                        log.error("Encountered second exception while trying to close workspace after initial exception");
+                        log.error("Original exception:", t);
+                        throw t2;
+                    }
+                }
            }
            Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace);
        }

        //Now, add the gradients in the order we need them in for flattening (same as params order)
        Gradient gradient = new DefaultGradient(flattenedGradients);
-        for (Triple<String, INDArray, Character> t : gradients) {
-            gradient.setGradientFor(t.getFirst(), t.getSecond(), t.getThird());
+        for (Triple<String, INDArray, Character> tr : gradients) {
+            gradient.setGradientFor(tr.getFirst(), tr.getSecond(), tr.getThird());
        }

        this.gradient = gradient;
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/multilayer/MultiLayerNetwork.java
@ -1242,6 +1242,7 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura

        boolean traceLog = log.isTraceEnabled();

+        Throwable t = null;
        try {
            for (int i = 0; i <= layerIndex; i++) {
                LayerWorkspaceMgr mgr = (i % 2 == 0 ? mgrEven : mgrOdd);
@ -1328,17 +1329,34 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura
                    mgr.setWorkspace(ArrayType.INPUT, WS_LAYER_ACT_2, WS_LAYER_ACT_X_CONFIG);            //Inputs should always be in the previous WS
                }
            }
-
+        } catch (Throwable t2){
+            t = t2;
        } finally {
            if(wsActCloseNext != null){
+                try {
                    wsActCloseNext.close();
+                } catch (Throwable t2){
+                    if(t != null){
+                        log.error("Encountered second exception while trying to close workspace after initial exception");
+                        log.error("Original exception:", t);
+                        throw t2;
+                    }
+                }
            }
            if(temp != null){
                //Should only be non-null on exception
                while(temp.isScopeActive()){
                    //For safety, should never occur in theory: a single close() call may not be sufficient, if
                    // workspace scope was borrowed and not properly closed when exception occurred
+                    try{
                        temp.close();
+                    } catch (Throwable t2){
+                        if(t != null){
+                            log.error("Encountered second exception while trying to close workspace after initial exception");
+                            log.error("Original exception:", t);
+                            throw t2;
+                        }
+                    }
                }
            }

@ -1871,6 +1889,7 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura

        boolean traceLog = log.isTraceEnabled();

+        Throwable t = null;
        try {
            for (int i = layers.length - 1; i >= 0; i--) {
                if (layers[i] instanceof FrozenLayer) {
@ -1961,13 +1980,31 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura
                    log.trace("Completed backprop: {} - {}", i, layers[i].getClass().getSimpleName());
                }
            }
+        } catch (Throwable thr ){
+            t = thr;
        } finally {
            if(wsActGradCloseNext != null){
+                try {
                    wsActGradCloseNext.close();
+                } catch (Throwable t2){
+                    if(t != null){
+                        log.error("Encountered second exception while trying to close workspace after initial exception");
+                        log.error("Original exception:", t);
+                        throw t2;
+                    }
+                }
            }
            if(wsActGradTemp != null) {
                //Should only be non-null on exception
+                try {
                    wsActGradTemp.close();
+                } catch (Throwable t2) {
+                    if (t != null) {
+                        log.error("Encountered second exception while trying to close workspace after initial exception");
+                        log.error("Original exception:", t);
+                        throw t2;
+                    }
+                }
            }
            Nd4j.getMemoryManager().setCurrentWorkspace(initialWorkspace);
        }