[WIP] Few more pre-release fixes (#461)
* error code check in CudaMemoryManager — Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* clear — Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* clear model before exiting — Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* MultiLayerNetwork/ComputationGraph.close() [WIP] (#460) — Signed-off-by: Alex Black <blacka101@gmail.com>
* Copyright header — Signed-off-by: Alex Black <blacka101@gmail.com>
* Fix — Signed-off-by: Alex Black <blacka101@gmail.com>
* Fix for handling release of nested DataBuffers; a couple of additional tests for released DataBuffers — Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* PW test: increase number of epochs slightly — Signed-off-by: raver119@gmail.com <raver119@gmail.com>
Co-authored-by: Alex Black <blacka101@gmail.com>
This commit is contained in:
		
							parent
							
								
									1ce65fced4
								
							
						
					
					
						commit
						60f103fb03
					
				| @ -0,0 +1,151 @@ | ||||
| /* ****************************************************************************** | ||||
|  * Copyright (c) 2020 Konduit K.K. | ||||
|  * | ||||
|  * This program and the accompanying materials are made available under the | ||||
|  * terms of the Apache License, Version 2.0 which is available at | ||||
|  * https://www.apache.org/licenses/LICENSE-2.0. | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||||
|  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||||
|  * License for the specific language governing permissions and limitations | ||||
|  * under the License. | ||||
|  * | ||||
|  * SPDX-License-Identifier: Apache-2.0 | ||||
|  ******************************************************************************/ | ||||
| package org.deeplearning4j.nn.misc; | ||||
| 
 | ||||
| import org.deeplearning4j.BaseDL4JTest; | ||||
| import org.deeplearning4j.TestUtils; | ||||
| import org.deeplearning4j.nn.api.Updater; | ||||
| import org.deeplearning4j.nn.conf.MultiLayerConfiguration; | ||||
| import org.deeplearning4j.nn.conf.NeuralNetConfiguration; | ||||
| import org.deeplearning4j.nn.conf.inputs.InputType; | ||||
| import org.deeplearning4j.nn.conf.layers.*; | ||||
| import org.deeplearning4j.nn.graph.ComputationGraph; | ||||
| import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; | ||||
| import org.junit.Test; | ||||
| import org.nd4j.linalg.activations.Activation; | ||||
| import org.nd4j.linalg.api.buffer.DataType; | ||||
| import org.nd4j.linalg.api.ndarray.INDArray; | ||||
| import org.nd4j.linalg.factory.Nd4j; | ||||
| import org.nd4j.linalg.learning.config.Adam; | ||||
| 
 | ||||
| import static org.junit.Assert.assertTrue; | ||||
| 
 | ||||
| public class CloseNetworkTests extends BaseDL4JTest { | ||||
| 
 | ||||
|     public static MultiLayerNetwork getTestNet() { | ||||
|         MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() | ||||
|                 .updater(new Adam(1e-3)) | ||||
|                 .list() | ||||
|                 .layer(new ConvolutionLayer.Builder().nOut(5).kernelSize(3, 3).activation(Activation.TANH).build()) | ||||
|                 .layer(new BatchNormalization.Builder().nOut(5).build()) | ||||
|                 .layer(new SubsamplingLayer.Builder().build()) | ||||
|                 .layer(new DenseLayer.Builder().nOut(10).activation(Activation.RELU).build()) | ||||
|                 .layer(new OutputLayer.Builder().nOut(10).build()) | ||||
|                 .setInputType(InputType.convolutional(28, 28, 1)) | ||||
|                 .build(); | ||||
| 
 | ||||
|         MultiLayerNetwork net = new MultiLayerNetwork(conf); | ||||
|         net.init(); | ||||
| 
 | ||||
|         return net; | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testCloseMLN() { | ||||
|         for (boolean train : new boolean[]{false, true}) { | ||||
|             for (boolean test : new boolean[]{false, true}) { | ||||
|                 MultiLayerNetwork net = getTestNet(); | ||||
| 
 | ||||
|                 INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28); | ||||
|                 INDArray l = TestUtils.randomOneHot(16, 10); | ||||
| 
 | ||||
|                 if (train) { | ||||
|                     for (int i = 0; i < 3; i++) { | ||||
|                         net.fit(f, l); | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 if (test) { | ||||
|                     for (int i = 0; i < 3; i++) { | ||||
|                         net.output(f); | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 net.close(); | ||||
| 
 | ||||
|                 assertTrue(net.params().wasClosed()); | ||||
|                 if(train) { | ||||
|                     assertTrue(net.getGradientsViewArray().wasClosed()); | ||||
|                     Updater u = net.getUpdater(false); | ||||
|                     assertTrue(u.getStateViewArray().wasClosed()); | ||||
|                 } | ||||
| 
 | ||||
|                 //Make sure we don't get crashes etc when trying to use after closing | ||||
|                 try { | ||||
|                     net.output(f); | ||||
|                 } catch (IllegalStateException e) { | ||||
|                     String msg = e.getMessage(); | ||||
|                     assertTrue(msg, msg.contains("released")); | ||||
|                 } | ||||
| 
 | ||||
|                 try { | ||||
|                     net.fit(f, l); | ||||
|                 } catch (IllegalStateException e) { | ||||
|                     String msg = e.getMessage(); | ||||
|                     assertTrue(msg, msg.contains("released")); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testCloseCG() { | ||||
|         for (boolean train : new boolean[]{false, true}) { | ||||
|             for (boolean test : new boolean[]{false, true}) { | ||||
|                 ComputationGraph net = getTestNet().toComputationGraph(); | ||||
| 
 | ||||
|                 INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28); | ||||
|                 INDArray l = TestUtils.randomOneHot(16, 10); | ||||
| 
 | ||||
|                 if (train) { | ||||
|                     for (int i = 0; i < 3; i++) { | ||||
|                         net.fit(new INDArray[]{f}, new INDArray[]{l}); | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 if (test) { | ||||
|                     for (int i = 0; i < 3; i++) { | ||||
|                         net.output(f); | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 net.close(); | ||||
| 
 | ||||
|                 assertTrue(net.params().wasClosed()); | ||||
|                 if(train) { | ||||
|                     assertTrue(net.getGradientsViewArray().wasClosed()); | ||||
|                     Updater u = net.getUpdater(false); | ||||
|                     assertTrue(u.getStateViewArray().wasClosed()); | ||||
|                 } | ||||
| 
 | ||||
|                 //Make sure we don't get crashes etc when trying to use after closing | ||||
|                 try { | ||||
|                     net.output(f); | ||||
|                 } catch (IllegalStateException e) { | ||||
|                     String msg = e.getMessage(); | ||||
|                     assertTrue(msg, msg.contains("released")); | ||||
|                 } | ||||
| 
 | ||||
|                 try { | ||||
|                     net.fit(new INDArray[]{f}, new INDArray[]{l}); | ||||
|                 } catch (IllegalStateException e) { | ||||
|                     String msg = e.getMessage(); | ||||
|                     assertTrue(msg, msg.contains("released")); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @ -1035,5 +1035,9 @@ public class TestOptimizers extends BaseDL4JTest { | ||||
        public boolean updaterDivideByMinibatch(String paramName) {
            //All parameters of this test model use standard minibatch-averaged gradients
            return true;
        }
| 
 | ||||
        @Override
        public void close(){
            //No-op: this test model owns no off-heap resources to release
        }
|     } | ||||
| } | ||||
|  | ||||
| @ -1055,4 +1055,9 @@ public class BarnesHutTsne implements Model { | ||||
| 
 | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
    @Override
    public void close(){
        //No-op: nothing closeable is owned here — TODO confirm no native buffers need releasing
    }
| } | ||||
|  | ||||
| @ -0,0 +1,4 @@ | ||||
| package org.deeplearning4j.nn.modelimport.keras; | ||||
| 
 | ||||
// NOTE(review): empty placeholder class with no members and no usages visible here —
// presumably a leftover from this [WIP] commit; confirm whether it can be removed.
public class Temp {
}
| @ -233,4 +233,7 @@ public interface Model { | ||||
|      * Apply any constraints to the model | ||||
|      */ | ||||
|     void applyConstraints(int iteration, int epoch); | ||||
| 

    /**
     * Close the model and release any native/off-heap resources it owns
     * (parameters, gradients, updater state, workspaces).
     * The model should not be used again after this method has been called.
     */
    void close();
| } | ||||
|  | ||||
| @ -4824,4 +4824,28 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { | ||||
|         if (cg.getUpdater() != null && cg.getUpdater(false).getStateViewArray() != null) | ||||
|             this.getUpdater(true).getStateViewArray().assign(cg.getUpdater(false).getStateViewArray()); | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
|      * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces | ||||
|      * Note that the network should not be used again for any purpose after it has been closed | ||||
|      */ | ||||
|     @Override | ||||
|     public void close(){ | ||||
|         //Close the INDArray and dealloc | ||||
|         if(flattenedParams.closeable()) | ||||
|             flattenedParams.close(); | ||||
| 
 | ||||
|         if(flattenedGradients != null && flattenedGradients.closeable()) | ||||
|             flattenedGradients.close(); | ||||
| 
 | ||||
|         Updater u = getUpdater(false); | ||||
|         if(u != null && u.getStateViewArray() != null) { | ||||
|             INDArray state = u.getStateViewArray(); | ||||
|             if(state.closeable()) | ||||
|                 state.close(); | ||||
|         } | ||||
| 
 | ||||
|         Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); | ||||
|         System.gc(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| @ -428,4 +428,9 @@ public abstract class AbstractLayer<LayerConfT extends org.deeplearning4j.nn.con | ||||
        //Majority of params' gradients should be... Exception: batch norm mean/variance estimate
|         return true; | ||||
|     } | ||||
| 
 | ||||
    @Override
    public void close(){
        //No-op for individual layers: parameter/gradient memory is a view owned and
        //released by the enclosing network
    }
| } | ||||
|  | ||||
| @ -599,4 +599,9 @@ public class BidirectionalLayer implements RecurrentLayer { | ||||
|             return ret; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
    @Override
    public void close(){
        //No-op for individual layers: parameter/gradient memory is a view owned and
        //released by the enclosing network
    }
| } | ||||
|  | ||||
| @ -1144,4 +1144,9 @@ public class VariationalAutoencoder implements Layer { | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
    @Override
    public void close(){
        //No-op for individual layers: parameter/gradient memory is a view owned and
        //released by the enclosing network
    }
| } | ||||
|  | ||||
| @ -329,4 +329,9 @@ public abstract class BaseWrapperLayer implements Layer { | ||||
    public boolean updaterDivideByMinibatch(String paramName) {
        //Delegate the minibatch-division policy to the wrapped layer
        return underlying.updaterDivideByMinibatch(paramName);
    }
| 
 | ||||
    @Override
    public void close(){
        //No-op for individual layers; note the underlying wrapped layer is intentionally
        //not closed here — layer memory is owned by the enclosing network
    }
| } | ||||
|  | ||||
| @ -4085,4 +4085,27 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura | ||||
|             this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray()); | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
|      * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces | ||||
|      * Note that the network should not be used again for any purpose after it has been closed | ||||
|      */ | ||||
|     @Override | ||||
|     public void close(){ | ||||
|         //Close the INDArray and dealloc | ||||
|         if(flattenedParams.closeable()) | ||||
|             flattenedParams.close(); | ||||
| 
 | ||||
|         if(flattenedGradients != null && flattenedGradients.closeable()) | ||||
|             flattenedGradients.close(); | ||||
| 
 | ||||
|         Updater u = getUpdater(false); | ||||
|         if(u != null && u.getStateViewArray() != null) { | ||||
|             INDArray state = u.getStateViewArray(); | ||||
|             if(state.closeable()) | ||||
|                 state.close(); | ||||
|         } | ||||
| 
 | ||||
|         Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); | ||||
|         System.gc(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| @ -450,6 +450,14 @@ public class DefaultTrainer extends Thread implements Trainer { | ||||
|         } finally { | ||||
|             log.debug("Terminating all workspaces for trainer_{}", threadId); | ||||
|             Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); | ||||
| 
 | ||||
|             if (!onRootModel) { | ||||
|                 replicatedModel.close(); | ||||
|             } | ||||
| 
 | ||||
|             // let's try to enforce GC to actually clean all references now | ||||
|             replicatedModel.clear(); | ||||
|             System.gc(); | ||||
|             isStopped.set(true); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @ -58,7 +58,7 @@ public class ParallelWrapperTest extends BaseDL4JTest { | ||||
| 
 | ||||
|         // for GPU you usually want to have higher batchSize | ||||
|         int batchSize = 128; | ||||
|         int nEpochs = 2; | ||||
|         int nEpochs = 5; | ||||
|         int seed = 123; | ||||
| 
 | ||||
|         log.info("Load data...."); | ||||
|  | ||||
| @ -1957,6 +1957,9 @@ public abstract class BaseDataBuffer implements DataBuffer { | ||||
| 
 | ||||
|     @Override | ||||
|     public boolean wasClosed() { | ||||
|         if (wrappedDataBuffer != null && wrappedDataBuffer != this) | ||||
|             return wrappedDataBuffer.wasClosed(); | ||||
| 
 | ||||
|         return released; | ||||
|     } | ||||
| 
 | ||||
|  | ||||
| @ -71,7 +71,13 @@ public class CudaMemoryManager extends BasicMemoryManager { | ||||
|             return ptr;//allocator.getMemoryHandler().alloc(AllocationStatus.HOST, null, null, initialize).getHostPointer(); | ||||
|         } else if (kind == MemoryKind.DEVICE) { | ||||
|             val ptr = NativeOpsHolder.getInstance().getDeviceNativeOps().mallocDevice(bytes, 0, 0); | ||||
|             //log.info("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread()); | ||||
|             log.trace("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread()); | ||||
| 
 | ||||
|             val ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode(); | ||||
|             if (ec != 0) { | ||||
|                 val em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); | ||||
|                 throw new RuntimeException(em + "; Bytes: [" + bytes + "]; Error code [" + ec + "]; DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "]"); | ||||
|             } | ||||
| 
 | ||||
|             if (ptr == null) | ||||
|                 throw new RuntimeException("Failed to allocate " + bytes + " bytes from DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "] memory"); | ||||
|  | ||||
| @ -85,6 +85,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda | ||||
|     } | ||||
| 
 | ||||
|     public OpaqueDataBuffer getOpaqueDataBuffer() { | ||||
|         if (released) | ||||
|             throw new IllegalStateException("You can't use DataBuffer once it was released"); | ||||
| 
 | ||||
|         return ptrDataBuffer; | ||||
|     } | ||||
| 
 | ||||
| @ -104,7 +107,8 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda | ||||
|         ptrDataBuffer = OpaqueDataBuffer.externalizedDataBuffer(length, this.type,  pointer, specialPointer); | ||||
|         this.allocationPoint = new AllocationPoint(ptrDataBuffer, this.type.width() * length); | ||||
| 
 | ||||
|         Nd4j.getDeallocatorService().pickObject(this); | ||||
|         Nd4j.getDeallocatorService().pickObject(this);if (released) | ||||
|             throw new IllegalStateException("You can't use DataBuffer once it was released"); | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
| @ -473,6 +477,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda | ||||
|     } | ||||
| 
 | ||||
|     public BaseCudaDataBuffer(@NonNull DataBuffer underlyingBuffer, long length, long offset) { | ||||
|         if (underlyingBuffer.wasClosed()) | ||||
|             throw new IllegalStateException("You can't use DataBuffer once it was released"); | ||||
| 
 | ||||
|         //this(length, underlyingBuffer.getElementSize(), offset); | ||||
|         this.allocationMode = AllocationMode.MIXED_DATA_TYPES; | ||||
|         initTypeAndSize(); | ||||
|  | ||||
| @ -198,4 +198,27 @@ public class BaseCudaDataBufferTest extends BaseND4JTest { | ||||
        // there should be no exceptions during execution
|         assertEquals(Nd4j.getAffinityManager().getNumberOfDevices(), cnt.get()); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testClose_1() { | ||||
|         val x = Nd4j.createFromArray(1, 2, 3); | ||||
| 
 | ||||
|         x.close(); | ||||
| 
 | ||||
|         assertTrue(x.wasClosed()); | ||||
|         assertTrue(x.data().wasClosed()); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testClose_2() { | ||||
|         val x = Nd4j.create(DataType.FLOAT, 5, 6); | ||||
|         val row = x.getRow(1); | ||||
|         x.close(); | ||||
| 
 | ||||
|         assertTrue(x.wasClosed()); | ||||
|         assertTrue(x.data().wasClosed()); | ||||
| 
 | ||||
|         assertTrue(row.wasClosed()); | ||||
|         assertTrue(row.data().wasClosed()); | ||||
|     } | ||||
| } | ||||
| @ -61,6 +61,9 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallo | ||||
|     } | ||||
| 
 | ||||
|     public OpaqueDataBuffer getOpaqueDataBuffer() { | ||||
|         if (released) | ||||
|             throw new IllegalStateException("You can't use DataBuffer once it was released"); | ||||
| 
 | ||||
|         return ptrDataBuffer; | ||||
|     } | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user