[WIP] Few more pre-release fixes (#461)

* error code check in CudaMemoryManager

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* clear

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* clear model before exiting

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* MultiLayerNetwork/ComputationGraph.close() [WIP] (#460)

* MultiLayerNetwork/ComputationGraph.close()

Signed-off-by: Alex Black <blacka101@gmail.com>

* Copyright header

Signed-off-by: Alex Black <blacka101@gmail.com>

* Fix

Signed-off-by: Alex Black <blacka101@gmail.com>

* - fix for handling release of nested DataBuffers
- couple of additional tests for released DataBuffers

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* PW test: increase number of epochs slightly

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

Co-authored-by: Alex Black <blacka101@gmail.com>
master
raver119 2020-05-13 16:00:54 +03:00 committed by GitHub
parent 1ce65fced4
commit 60f103fb03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 287 additions and 3 deletions

View File

@ -0,0 +1,151 @@
/* ******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.nn.misc;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.api.Updater;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.learning.config.Adam;
import static org.junit.Assert.assertTrue;
public class CloseNetworkTests extends BaseDL4JTest {

    /**
     * Builds and initializes a small CNN fixture (conv -> batchnorm -> subsampling -> dense -> output)
     * on 28x28x1 input, used by the close() tests below.
     */
    public static MultiLayerNetwork getTestNet() {
        MultiLayerConfiguration configuration = new NeuralNetConfiguration.Builder()
                .updater(new Adam(1e-3))
                .list()
                .layer(new ConvolutionLayer.Builder().nOut(5).kernelSize(3, 3).activation(Activation.TANH).build())
                .layer(new BatchNormalization.Builder().nOut(5).build())
                .layer(new SubsamplingLayer.Builder().build())
                .layer(new DenseLayer.Builder().nOut(10).activation(Activation.RELU).build())
                .layer(new OutputLayer.Builder().nOut(10).build())
                .setInputType(InputType.convolutional(28, 28, 1))
                .build();

        MultiLayerNetwork network = new MultiLayerNetwork(configuration);
        network.init();
        return network;
    }

    @Test
    public void testCloseMLN() {
        //Exercise all combinations of (trained, inferenced) before closing
        boolean[] flags = {false, true};
        for (boolean doFit : flags) {
            for (boolean doInference : flags) {
                MultiLayerNetwork network = getTestNet();
                INDArray features = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray labels = TestUtils.randomOneHot(16, 10);

                if (doFit) {
                    for (int epoch = 0; epoch < 3; epoch++) {
                        network.fit(features, labels);
                    }
                }

                if (doInference) {
                    for (int pass = 0; pass < 3; pass++) {
                        network.output(features);
                    }
                }

                network.close();

                //After close(), parameter (and, if training occurred, gradient/updater) buffers must be released
                assertTrue(network.params().wasClosed());
                if (doFit) {
                    assertTrue(network.getGradientsViewArray().wasClosed());
                    Updater updater = network.getUpdater(false);
                    assertTrue(updater.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    network.output(features);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    network.fit(features, labels);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }

    @Test
    public void testCloseCG() {
        //Same scenario as testCloseMLN, but for ComputationGraph
        boolean[] flags = {false, true};
        for (boolean doFit : flags) {
            for (boolean doInference : flags) {
                ComputationGraph network = getTestNet().toComputationGraph();
                INDArray features = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray labels = TestUtils.randomOneHot(16, 10);

                if (doFit) {
                    for (int epoch = 0; epoch < 3; epoch++) {
                        network.fit(new INDArray[]{features}, new INDArray[]{labels});
                    }
                }

                if (doInference) {
                    for (int pass = 0; pass < 3; pass++) {
                        network.output(features);
                    }
                }

                network.close();

                assertTrue(network.params().wasClosed());
                if (doFit) {
                    assertTrue(network.getGradientsViewArray().wasClosed());
                    Updater updater = network.getUpdater(false);
                    assertTrue(updater.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    network.output(features);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    network.fit(new INDArray[]{features}, new INDArray[]{labels});
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }
}

View File

@ -1035,5 +1035,9 @@ public class TestOptimizers extends BaseDL4JTest {
public boolean updaterDivideByMinibatch(String paramName) { public boolean updaterDivideByMinibatch(String paramName) {
return true; return true;
} }
@Override
public void close(){
    //No-op: this test model holds no native resources that would need releasing
}
} }
} }

View File

@ -1055,4 +1055,9 @@ public class BarnesHutTsne implements Model {
} }
@Override
public void close(){
    //No-op - nothing to release here when the model is closed
}
} }

View File

@ -0,0 +1,4 @@
package org.deeplearning4j.nn.modelimport.keras;
// NOTE(review): empty class with no references in this change - looks like an
// accidentally committed placeholder. TODO confirm whether it should be removed.
public class Temp {
}

View File

@ -233,4 +233,7 @@ public interface Model {
* Apply any constraints to the model * Apply any constraints to the model
*/ */
void applyConstraints(int iteration, int epoch); void applyConstraints(int iteration, int epoch);
/**
 * Close the model and deallocate any native memory it holds (parameters, gradients, updater state).
 * The model should not be used again for any purpose after it has been closed.
 */
void close();
} }

View File

@ -4824,4 +4824,28 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
if (cg.getUpdater() != null && cg.getUpdater(false).getStateViewArray() != null) if (cg.getUpdater() != null && cg.getUpdater(false).getStateViewArray() != null)
this.getUpdater(true).getStateViewArray().assign(cg.getUpdater(false).getStateViewArray()); this.getUpdater(true).getStateViewArray().assign(cg.getUpdater(false).getStateViewArray());
} }
/**
 * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
 * Note that the network should not be used again for any purpose after it has been closed
 */
@Override
public void close(){
    //Close the INDArrays and dealloc. Null/closeable guards: buffers may be views or already released
    if(flattenedParams != null && flattenedParams.closeable())
        flattenedParams.close();

    if(flattenedGradients != null && flattenedGradients.closeable())
        flattenedGradients.close();

    //Updater state view array, if any updater state exists (i.e., network has been fit)
    Updater u = getUpdater(false);
    if(u != null && u.getStateViewArray() != null) {
        INDArray state = u.getStateViewArray();
        if(state.closeable())
            state.close();
    }

    //Release workspace memory owned by this thread, then encourage collection of remaining references
    Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
    System.gc();
}
} }

View File

@ -428,4 +428,9 @@ public abstract class AbstractLayer<LayerConfT extends org.deeplearning4j.nn.con
//Majority of params's gradients should be... Exception: batch norm mean/variance estimate //Majority of params's gradients should be... Exception: batch norm mean/variance estimate
return true; return true;
} }
@Override
public void close(){
    //No-op for individual layers - native memory is owned and released by the enclosing network
}
} }

View File

@ -599,4 +599,9 @@ public class BidirectionalLayer implements RecurrentLayer {
return ret; return ret;
} }
} }
@Override
public void close(){
    //No-op for individual layers - native memory is owned and released by the enclosing network
}
} }

View File

@ -1144,4 +1144,9 @@ public class VariationalAutoencoder implements Layer {
} }
} }
} }
@Override
public void close(){
    //No-op for individual layers - native memory is owned and released by the enclosing network
}
} }

View File

@ -329,4 +329,9 @@ public abstract class BaseWrapperLayer implements Layer {
public boolean updaterDivideByMinibatch(String paramName) { public boolean updaterDivideByMinibatch(String paramName) {
return underlying.updaterDivideByMinibatch(paramName); return underlying.updaterDivideByMinibatch(paramName);
} }
@Override
public void close(){
    //No-op for individual layers
    //NOTE(review): intentionally does NOT delegate to underlying.close() - confirm this is by design,
    //since other methods of this wrapper forward to the underlying layer
}
} }

View File

@ -4085,4 +4085,27 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, Neura
this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray()); this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray());
} }
/**
 * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
 * Note that the network should not be used again for any purpose after it has been closed
 */
@Override
public void close(){
    //Close the INDArrays and dealloc. Null/closeable guards: buffers may be views or already released
    if(flattenedParams != null && flattenedParams.closeable())
        flattenedParams.close();

    if(flattenedGradients != null && flattenedGradients.closeable())
        flattenedGradients.close();

    //Updater state view array, if any updater state exists (i.e., network has been fit)
    Updater u = getUpdater(false);
    if(u != null && u.getStateViewArray() != null) {
        INDArray state = u.getStateViewArray();
        if(state.closeable())
            state.close();
    }

    //Release workspace memory owned by this thread, then encourage collection of remaining references
    Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
    System.gc();
}
} }

View File

@ -450,6 +450,14 @@ public class DefaultTrainer extends Thread implements Trainer {
} finally { } finally {
log.debug("Terminating all workspaces for trainer_{}", threadId); log.debug("Terminating all workspaces for trainer_{}", threadId);
Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread(); Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
if (!onRootModel) {
replicatedModel.close();
}
// let's try to enforce GC to actually clean all references now
replicatedModel.clear();
System.gc();
isStopped.set(true); isStopped.set(true);
} }
} }

View File

@ -58,7 +58,7 @@ public class ParallelWrapperTest extends BaseDL4JTest {
// for GPU you usually want to have higher batchSize // for GPU you usually want to have higher batchSize
int batchSize = 128; int batchSize = 128;
int nEpochs = 2; int nEpochs = 5;
int seed = 123; int seed = 123;
log.info("Load data...."); log.info("Load data....");

View File

@ -1957,6 +1957,9 @@ public abstract class BaseDataBuffer implements DataBuffer {
@Override @Override
public boolean wasClosed() { public boolean wasClosed() {
if (wrappedDataBuffer != null && wrappedDataBuffer != this)
return wrappedDataBuffer.wasClosed();
return released; return released;
} }

View File

@ -71,7 +71,13 @@ public class CudaMemoryManager extends BasicMemoryManager {
return ptr;//allocator.getMemoryHandler().alloc(AllocationStatus.HOST, null, null, initialize).getHostPointer(); return ptr;//allocator.getMemoryHandler().alloc(AllocationStatus.HOST, null, null, initialize).getHostPointer();
} else if (kind == MemoryKind.DEVICE) { } else if (kind == MemoryKind.DEVICE) {
val ptr = NativeOpsHolder.getInstance().getDeviceNativeOps().mallocDevice(bytes, 0, 0); val ptr = NativeOpsHolder.getInstance().getDeviceNativeOps().mallocDevice(bytes, 0, 0);
//log.info("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread()); log.trace("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread());
val ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode();
if (ec != 0) {
val em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage();
throw new RuntimeException(em + "; Bytes: [" + bytes + "]; Error code [" + ec + "]; DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "]");
}
if (ptr == null) if (ptr == null)
throw new RuntimeException("Failed to allocate " + bytes + " bytes from DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "] memory"); throw new RuntimeException("Failed to allocate " + bytes + " bytes from DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "] memory");

View File

@ -85,6 +85,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
} }
public OpaqueDataBuffer getOpaqueDataBuffer() { public OpaqueDataBuffer getOpaqueDataBuffer() {
if (released)
throw new IllegalStateException("You can't use DataBuffer once it was released");
return ptrDataBuffer; return ptrDataBuffer;
} }
@ -104,7 +107,8 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
ptrDataBuffer = OpaqueDataBuffer.externalizedDataBuffer(length, this.type, pointer, specialPointer); ptrDataBuffer = OpaqueDataBuffer.externalizedDataBuffer(length, this.type, pointer, specialPointer);
this.allocationPoint = new AllocationPoint(ptrDataBuffer, this.type.width() * length); this.allocationPoint = new AllocationPoint(ptrDataBuffer, this.type.width() * length);
Nd4j.getDeallocatorService().pickObject(this); Nd4j.getDeallocatorService().pickObject(this);if (released)
throw new IllegalStateException("You can't use DataBuffer once it was released");
} }
/** /**
@ -473,6 +477,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
} }
public BaseCudaDataBuffer(@NonNull DataBuffer underlyingBuffer, long length, long offset) { public BaseCudaDataBuffer(@NonNull DataBuffer underlyingBuffer, long length, long offset) {
if (underlyingBuffer.wasClosed())
throw new IllegalStateException("You can't use DataBuffer once it was released");
//this(length, underlyingBuffer.getElementSize(), offset); //this(length, underlyingBuffer.getElementSize(), offset);
this.allocationMode = AllocationMode.MIXED_DATA_TYPES; this.allocationMode = AllocationMode.MIXED_DATA_TYPES;
initTypeAndSize(); initTypeAndSize();

View File

@ -198,4 +198,27 @@ public class BaseCudaDataBufferTest extends BaseND4JTest {
// there should be no exceptions during execution // there should be no exceptions during execution
assertEquals(Nd4j.getAffinityManager().getNumberOfDevices(), cnt.get()); assertEquals(Nd4j.getAffinityManager().getNumberOfDevices(), cnt.get());
} }
@Test
public void testClose_1() {
    //Closing an array must mark both the array itself and its backing buffer as closed
    val array = Nd4j.createFromArray(1, 2, 3);
    array.close();

    assertTrue(array.wasClosed());
    assertTrue(array.data().wasClosed());
}
@Test
public void testClose_2() {
    //Closing the parent array must also release views (row) backed by the same buffer
    val matrix = Nd4j.create(DataType.FLOAT, 5, 6);
    val rowView = matrix.getRow(1);
    matrix.close();

    assertTrue(matrix.wasClosed());
    assertTrue(matrix.data().wasClosed());

    assertTrue(rowView.wasClosed());
    assertTrue(rowView.data().wasClosed());
}
} }

View File

@ -61,6 +61,9 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallo
} }
public OpaqueDataBuffer getOpaqueDataBuffer() { public OpaqueDataBuffer getOpaqueDataBuffer() {
if (released)
throw new IllegalStateException("You can't use DataBuffer once it was released");
return ptrDataBuffer; return ptrDataBuffer;
} }