[WIP] Few more pre-release fixes (#461)
* error code check in CudaMemoryManager
* clear
* clear model before exiting
* MultiLayerNetwork/ComputationGraph.close() [WIP] (#460)
  * MultiLayerNetwork/ComputationGraph.close()
  * Copyright header
  * Fix
* fix for handling release of nested DataBuffers
  - couple of additional tests for released DataBuffers
* PW test: increase number of epochs slightly

Signed-off-by: raver119@gmail.com <raver119@gmail.com>
Signed-off-by: Alex Black <blacka101@gmail.com>
Co-authored-by: Alex Black <blacka101@gmail.com>

parent 1ce65fced4
commit 60f103fb03
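For orientation before the diff itself, a minimal usage sketch of the close() API this commit introduces. getTestNet() is the helper defined in the new CloseNetworkTests file below; f and l are placeholder arrays, not part of the change.

// Hedged sketch of the new lifecycle: use the network, then release native memory explicitly.
MultiLayerNetwork net = getTestNet();
INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
INDArray l = TestUtils.randomOneHot(16, 10);

net.fit(f, l);
INDArray out = net.output(f);

net.close();                                    // frees parameters, gradients, updater state and workspaces

boolean released = net.params().wasClosed();    // true after close()
// Any further call such as net.output(f) now throws IllegalStateException mentioning "released"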
@@ -0,0 +1,151 @@
/* ******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j.nn.misc;

import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.api.Updater;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.learning.config.Adam;

import static org.junit.Assert.assertTrue;

public class CloseNetworkTests extends BaseDL4JTest {

    public static MultiLayerNetwork getTestNet() {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .updater(new Adam(1e-3))
                .list()
                .layer(new ConvolutionLayer.Builder().nOut(5).kernelSize(3, 3).activation(Activation.TANH).build())
                .layer(new BatchNormalization.Builder().nOut(5).build())
                .layer(new SubsamplingLayer.Builder().build())
                .layer(new DenseLayer.Builder().nOut(10).activation(Activation.RELU).build())
                .layer(new OutputLayer.Builder().nOut(10).build())
                .setInputType(InputType.convolutional(28, 28, 1))
                .build();

        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();

        return net;
    }

    @Test
    public void testCloseMLN() {
        for (boolean train : new boolean[]{false, true}) {
            for (boolean test : new boolean[]{false, true}) {
                MultiLayerNetwork net = getTestNet();

                INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray l = TestUtils.randomOneHot(16, 10);

                if (train) {
                    for (int i = 0; i < 3; i++) {
                        net.fit(f, l);
                    }
                }

                if (test) {
                    for (int i = 0; i < 3; i++) {
                        net.output(f);
                    }
                }

                net.close();

                assertTrue(net.params().wasClosed());
                if(train) {
                    assertTrue(net.getGradientsViewArray().wasClosed());
                    Updater u = net.getUpdater(false);
                    assertTrue(u.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    net.output(f);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    net.fit(f, l);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }

    @Test
    public void testCloseCG() {
        for (boolean train : new boolean[]{false, true}) {
            for (boolean test : new boolean[]{false, true}) {
                ComputationGraph net = getTestNet().toComputationGraph();

                INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray l = TestUtils.randomOneHot(16, 10);

                if (train) {
                    for (int i = 0; i < 3; i++) {
                        net.fit(new INDArray[]{f}, new INDArray[]{l});
                    }
                }

                if (test) {
                    for (int i = 0; i < 3; i++) {
                        net.output(f);
                    }
                }

                net.close();

                assertTrue(net.params().wasClosed());
                if(train) {
                    assertTrue(net.getGradientsViewArray().wasClosed());
                    Updater u = net.getUpdater(false);
                    assertTrue(u.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    net.output(f);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    net.fit(new INDArray[]{f}, new INDArray[]{l});
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }
}
@@ -1035,5 +1035,9 @@ public class TestOptimizers extends BaseDL4JTest {
        public boolean updaterDivideByMinibatch(String paramName) {
            return true;
        }

        @Override
        public void close(){
        }
    }
}
@@ -1055,4 +1055,9 @@ public class BarnesHutTsne implements Model {

    }


    @Override
    public void close(){
        //No-op
    }
}
@@ -0,0 +1,4 @@
package org.deeplearning4j.nn.modelimport.keras;

public class Temp {
}
@@ -233,4 +233,7 @@ public interface Model {
     * Apply any constraints to the model
     */
    void applyConstraints(int iteration, int epoch);


    void close();
}
@@ -4824,4 +4824,28 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
        if (cg.getUpdater() != null && cg.getUpdater(false).getStateViewArray() != null)
            this.getUpdater(true).getStateViewArray().assign(cg.getUpdater(false).getStateViewArray());
    }

    /**
     * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
     * Note that the network should not be used again for any purpose after it has been closed
     */
    @Override
    public void close(){
        //Close the INDArray and dealloc
        if(flattenedParams.closeable())
            flattenedParams.close();

        if(flattenedGradients != null && flattenedGradients.closeable())
            flattenedGradients.close();

        Updater u = getUpdater(false);
        if(u != null && u.getStateViewArray() != null) {
            INDArray state = u.getStateViewArray();
            if(state.closeable())
                state.close();
        }

        Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
        System.gc();
    }
}
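As an aside, the closeable() guards above are the same defensive idiom user code can apply when it releases arrays manually; a short sketch under the assumption that the scratch array is illustrative and not owned by a workspace:

// Hedged sketch of the closeable()/close() guard used in close() above.
INDArray scratch = Nd4j.create(DataType.FLOAT, 1024);
// ... use scratch ...
if (scratch.closeable())     // typically false for views, workspace-attached or already-released arrays
    scratch.close();         // release only buffers this code actually owns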
@@ -428,4 +428,9 @@ public abstract class AbstractLayer<LayerConfT extends org.deeplearning4j.nn.con
        //Majority of params' gradients should be... Exception: batch norm mean/variance estimate
        return true;
    }

    @Override
    public void close(){
        //No-op for individual layers
    }
}
@@ -599,4 +599,9 @@ public class BidirectionalLayer implements RecurrentLayer {
            return ret;
        }
    }

    @Override
    public void close(){
        //No-op for individual layers
    }
}
@@ -1144,4 +1144,9 @@ public class VariationalAutoencoder implements Layer {
            }
        }
    }

    @Override
    public void close(){
        //No-op for individual layers
    }
}
@@ -329,4 +329,9 @@ public abstract class BaseWrapperLayer implements Layer {
    public boolean updaterDivideByMinibatch(String paramName) {
        return underlying.updaterDivideByMinibatch(paramName);
    }

    @Override
    public void close(){
        //No-op for individual layers
    }
}
@@ -4085,4 +4085,27 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura
        this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray());
    }

    /**
     * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
     * Note that the network should not be used again for any purpose after it has been closed
     */
    @Override
    public void close(){
        //Close the INDArray and dealloc
        if(flattenedParams.closeable())
            flattenedParams.close();

        if(flattenedGradients != null && flattenedGradients.closeable())
            flattenedGradients.close();

        Updater u = getUpdater(false);
        if(u != null && u.getStateViewArray() != null) {
            INDArray state = u.getStateViewArray();
            if(state.closeable())
                state.close();
        }

        Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
        System.gc();
    }
}
@@ -450,6 +450,14 @@ public class DefaultTrainer extends Thread implements Trainer {
        } finally {
            log.debug("Terminating all workspaces for trainer_{}", threadId);
            Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();

            if (!onRootModel) {
                replicatedModel.close();
            }

            // let's try to enforce GC to actually clean all references now
            replicatedModel.clear();
            System.gc();
            isStopped.set(true);
        }
    }
@@ -58,7 +58,7 @@ public class ParallelWrapperTest extends BaseDL4JTest {

        // for GPU you usually want to have higher batchSize
        int batchSize = 128;
-       int nEpochs = 2;
+       int nEpochs = 5;
        int seed = 123;

        log.info("Load data....");
@@ -1957,6 +1957,9 @@ public abstract class BaseDataBuffer implements DataBuffer {

    @Override
    public boolean wasClosed() {
        if (wrappedDataBuffer != null && wrappedDataBuffer != this)
            return wrappedDataBuffer.wasClosed();

        return released;
    }
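The delegation to wrappedDataBuffer above is what lets a view report the released state of its parent buffer; a short sketch of the resulting behaviour (mirrored by testClose_2 further down):

// Hedged sketch: a row view wraps the parent array's DataBuffer.
INDArray x = Nd4j.create(DataType.FLOAT, 5, 6);
INDArray row = x.getRow(1);     // row's buffer wraps x's buffer
x.close();                      // releases the underlying native buffer

boolean viewReleased = row.data().wasClosed();   // true thanks to the delegation above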
@@ -71,7 +71,13 @@ public class CudaMemoryManager extends BasicMemoryManager {
            return ptr;//allocator.getMemoryHandler().alloc(AllocationStatus.HOST, null, null, initialize).getHostPointer();
        } else if (kind == MemoryKind.DEVICE) {
            val ptr = NativeOpsHolder.getInstance().getDeviceNativeOps().mallocDevice(bytes, 0, 0);
-           //log.info("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread());
+           log.trace("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread());

            val ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode();
            if (ec != 0) {
                val em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage();
                throw new RuntimeException(em + "; Bytes: [" + bytes + "]; Error code [" + ec + "]; DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "]");
            }

            if (ptr == null)
                throw new RuntimeException("Failed to allocate " + bytes + " bytes from DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "] memory");
@@ -85,6 +85,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
    }

    public OpaqueDataBuffer getOpaqueDataBuffer() {
        if (released)
            throw new IllegalStateException("You can't use DataBuffer once it was released");

        return ptrDataBuffer;
    }

@@ -104,7 +107,8 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
        ptrDataBuffer = OpaqueDataBuffer.externalizedDataBuffer(length, this.type, pointer, specialPointer);
        this.allocationPoint = new AllocationPoint(ptrDataBuffer, this.type.width() * length);

        Nd4j.getDeallocatorService().pickObject(this);
        if (released)
            throw new IllegalStateException("You can't use DataBuffer once it was released");
    }

    /**
@@ -473,6 +477,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
    }

    public BaseCudaDataBuffer(@NonNull DataBuffer underlyingBuffer, long length, long offset) {
        if (underlyingBuffer.wasClosed())
            throw new IllegalStateException("You can't use DataBuffer once it was released");

        //this(length, underlyingBuffer.getElementSize(), offset);
        this.allocationMode = AllocationMode.MIXED_DATA_TYPES;
        initTypeAndSize();

@@ -198,4 +198,27 @@ public class BaseCudaDataBufferTest extends BaseND4JTest {
        // there should be no exceptions during execution
        assertEquals(Nd4j.getAffinityManager().getNumberOfDevices(), cnt.get());
    }

    @Test
    public void testClose_1() {
        val x = Nd4j.createFromArray(1, 2, 3);

        x.close();

        assertTrue(x.wasClosed());
        assertTrue(x.data().wasClosed());
    }

    @Test
    public void testClose_2() {
        val x = Nd4j.create(DataType.FLOAT, 5, 6);
        val row = x.getRow(1);
        x.close();

        assertTrue(x.wasClosed());
        assertTrue(x.data().wasClosed());

        assertTrue(row.wasClosed());
        assertTrue(row.data().wasClosed());
    }
}
@@ -61,6 +61,9 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallo
    }

    public OpaqueDataBuffer getOpaqueDataBuffer() {
        if (released)
            throw new IllegalStateException("You can't use DataBuffer once it was released");

        return ptrDataBuffer;
    }
