[WIP] Few more pre-release fixes (#461)

* error code check in CudaMemoryManager
* clear
* clear model before exiting
* MultiLayerNetwork/ComputationGraph.close() [WIP] (#460)
  * MultiLayerNetwork/ComputationGraph.close()
  * Copyright header
  * Fix
* fix for handling release of nested DataBuffers; couple of additional tests for released DataBuffers
* PW test: increase number of epochs slightly

Signed-off-by: raver119@gmail.com <raver119@gmail.com>
Signed-off-by: Alex Black <blacka101@gmail.com>
Co-authored-by: Alex Black <blacka101@gmail.com>

parent 1ce65fced4
commit 60f103fb03
@@ -0,0 +1,151 @@
/* ******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j.nn.misc;

import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.TestUtils;
import org.deeplearning4j.nn.api.Updater;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.learning.config.Adam;

import static org.junit.Assert.assertTrue;

public class CloseNetworkTests extends BaseDL4JTest {

    public static MultiLayerNetwork getTestNet() {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .updater(new Adam(1e-3))
                .list()
                .layer(new ConvolutionLayer.Builder().nOut(5).kernelSize(3, 3).activation(Activation.TANH).build())
                .layer(new BatchNormalization.Builder().nOut(5).build())
                .layer(new SubsamplingLayer.Builder().build())
                .layer(new DenseLayer.Builder().nOut(10).activation(Activation.RELU).build())
                .layer(new OutputLayer.Builder().nOut(10).build())
                .setInputType(InputType.convolutional(28, 28, 1))
                .build();

        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();

        return net;
    }

    @Test
    public void testCloseMLN() {
        for (boolean train : new boolean[]{false, true}) {
            for (boolean test : new boolean[]{false, true}) {
                MultiLayerNetwork net = getTestNet();

                INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray l = TestUtils.randomOneHot(16, 10);

                if (train) {
                    for (int i = 0; i < 3; i++) {
                        net.fit(f, l);
                    }
                }

                if (test) {
                    for (int i = 0; i < 3; i++) {
                        net.output(f);
                    }
                }

                net.close();

                assertTrue(net.params().wasClosed());
                if (train) {
                    assertTrue(net.getGradientsViewArray().wasClosed());
                    Updater u = net.getUpdater(false);
                    assertTrue(u.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    net.output(f);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    net.fit(f, l);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }

    @Test
    public void testCloseCG() {
        for (boolean train : new boolean[]{false, true}) {
            for (boolean test : new boolean[]{false, true}) {
                ComputationGraph net = getTestNet().toComputationGraph();

                INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);
                INDArray l = TestUtils.randomOneHot(16, 10);

                if (train) {
                    for (int i = 0; i < 3; i++) {
                        net.fit(new INDArray[]{f}, new INDArray[]{l});
                    }
                }

                if (test) {
                    for (int i = 0; i < 3; i++) {
                        net.output(f);
                    }
                }

                net.close();

                assertTrue(net.params().wasClosed());
                if (train) {
                    assertTrue(net.getGradientsViewArray().wasClosed());
                    Updater u = net.getUpdater(false);
                    assertTrue(u.getStateViewArray().wasClosed());
                }

                //Make sure we don't get crashes etc when trying to use after closing
                try {
                    net.output(f);
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }

                try {
                    net.fit(new INDArray[]{f}, new INDArray[]{l});
                } catch (IllegalStateException e) {
                    String msg = e.getMessage();
                    assertTrue(msg, msg.contains("released"));
                }
            }
        }
    }
}
@@ -1035,5 +1035,9 @@ public class TestOptimizers extends BaseDL4JTest {
         public boolean updaterDivideByMinibatch(String paramName) {
             return true;
         }
+
+        @Override
+        public void close(){
+        }
     }
 }
@@ -1055,4 +1055,9 @@ public class BarnesHutTsne implements Model {

     }
+
+
+    @Override
+    public void close(){
+        //No-op
+    }
 }
@@ -0,0 +1,4 @@
package org.deeplearning4j.nn.modelimport.keras;

public class Temp {
}
@@ -233,4 +233,7 @@ public interface Model {
      * Apply any constraints to the model
      */
     void applyConstraints(int iteration, int epoch);
+
+
+    void close();
 }
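Since close() is added to the Model interface without a default, every Model implementation must now provide it; as the hunks below show, implementations that hold no off-heap state satisfy the contract with a no-op. A minimal sketch of that pattern (the class name is hypothetical, and the remaining Model methods are elided):

    public class MyCustomModel implements Model {
        // ... other Model methods omitted for brevity ...

        @Override
        public void close() {
            //No-op: this model owns no native memory to release
        }
    }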
@@ -4824,4 +4824,28 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
         if (cg.getUpdater() != null && cg.getUpdater(false).getStateViewArray() != null)
             this.getUpdater(true).getStateViewArray().assign(cg.getUpdater(false).getStateViewArray());
     }
+
+    /**
+     * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
+     * Note that the network should not be used again for any purpose after it has been closed
+     */
+    @Override
+    public void close(){
+        //Close the INDArray and dealloc
+        if(flattenedParams.closeable())
+            flattenedParams.close();
+
+        if(flattenedGradients != null && flattenedGradients.closeable())
+            flattenedGradients.close();
+
+        Updater u = getUpdater(false);
+        if(u != null && u.getStateViewArray() != null) {
+            INDArray state = u.getStateViewArray();
+            if(state.closeable())
+                state.close();
+        }
+
+        Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
+        System.gc();
+    }
 }
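For orientation, a minimal usage sketch of the new lifecycle (hypothetical driver code, not part of this commit): after close(), the parameters, gradients, updater state and workspaces are deallocated, and any further use is expected to fail with an IllegalStateException whose message mentions the released buffer, exactly as CloseNetworkTests asserts above.

    ComputationGraph cg = getTestNet().toComputationGraph();
    INDArray f = Nd4j.rand(DataType.FLOAT, 16, 1, 28, 28);

    cg.output(f);                // normal use
    cg.close();                  // frees params, gradients, updater state, workspaces

    try {
        cg.output(f);            // invalid once closed
    } catch (IllegalStateException e) {
        // expected: message contains "released"
    }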
@@ -428,4 +428,9 @@ public abstract class AbstractLayer<LayerConfT extends org.deeplearning4j.nn.conf.layers.Layer> implements Layer {
         //Majority of params's gradients should be... Exception: batch norm mean/variance estimate
         return true;
     }
+
+    @Override
+    public void close(){
+        //No-op for individual layers
+    }
 }
@@ -599,4 +599,9 @@ public class BidirectionalLayer implements RecurrentLayer {
         return ret;
     }
+
+    @Override
+    public void close(){
+        //No-op for individual layers
+    }
 }
@@ -1144,4 +1144,9 @@ public class VariationalAutoencoder implements Layer {
             }
         }
     }
+
+    @Override
+    public void close(){
+        //No-op for individual layers
+    }
 }
@@ -329,4 +329,9 @@ public abstract class BaseWrapperLayer implements Layer {
     public boolean updaterDivideByMinibatch(String paramName) {
         return underlying.updaterDivideByMinibatch(paramName);
     }
+
+    @Override
+    public void close(){
+        //No-op for individual layers
+    }
 }
@@ -4085,4 +4085,27 @@ public class MultiLayerNetwork implements Serializable, Classifier, Layer, NeuralNetwork {
             this.getUpdater(true).getStateViewArray().assign(mln.getUpdater(false).getStateViewArray());
     }
+
+    /**
+     * Close the network and deallocate all native memory, including: parameters, gradients, updater memory and workspaces
+     * Note that the network should not be used again for any purpose after it has been closed
+     */
+    @Override
+    public void close(){
+        //Close the INDArray and dealloc
+        if(flattenedParams.closeable())
+            flattenedParams.close();
+
+        if(flattenedGradients != null && flattenedGradients.closeable())
+            flattenedGradients.close();
+
+        Updater u = getUpdater(false);
+        if(u != null && u.getStateViewArray() != null) {
+            INDArray state = u.getStateViewArray();
+            if(state.closeable())
+                state.close();
+        }
+
+        Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
+        System.gc();
+    }
 }
@@ -450,6 +450,14 @@ public class DefaultTrainer extends Thread implements Trainer {
         } finally {
             log.debug("Terminating all workspaces for trainer_{}", threadId);
             Nd4j.getWorkspaceManager().destroyAllWorkspacesForCurrentThread();
+
+            if (!onRootModel) {
+                replicatedModel.close();
+            }
+
+            // let's try to enforce GC to actually clean all references now
+            replicatedModel.clear();
+            System.gc();
             isStopped.set(true);
         }
     }
@@ -58,7 +58,7 @@ public class ParallelWrapperTest extends BaseDL4JTest {

         // for GPU you usually want to have higher batchSize
         int batchSize = 128;
-        int nEpochs = 2;
+        int nEpochs = 5;
         int seed = 123;

         log.info("Load data....");
@@ -1957,6 +1957,9 @@ public abstract class BaseDataBuffer implements DataBuffer {

     @Override
     public boolean wasClosed() {
+        if (wrappedDataBuffer != null && wrappedDataBuffer != this)
+            return wrappedDataBuffer.wasClosed();
+
         return released;
     }

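This wrappedDataBuffer check is what lets close() propagate through nested buffers: a view's DataBuffer wraps its parent's buffer, so once the parent is released the view reports wasClosed() == true as well. A short sketch of the behavior this enables (it mirrors testClose_2 further down):

    INDArray x = Nd4j.create(DataType.FLOAT, 5, 6);
    INDArray row = x.getRow(1);  // view: its DataBuffer wraps x's buffer

    x.close();

    // the view's buffer delegates wasClosed() to the released parent buffer
    assert row.data().wasClosed();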
@@ -71,7 +71,13 @@ public class CudaMemoryManager extends BasicMemoryManager {
             return ptr; //allocator.getMemoryHandler().alloc(AllocationStatus.HOST, null, null, initialize).getHostPointer();
         } else if (kind == MemoryKind.DEVICE) {
             val ptr = NativeOpsHolder.getInstance().getDeviceNativeOps().mallocDevice(bytes, 0, 0);
-            //log.info("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread());
+            log.trace("Allocating {} bytes for device_{}", bytes, Nd4j.getAffinityManager().getDeviceForCurrentThread());
+
+            val ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode();
+            if (ec != 0) {
+                val em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage();
+                throw new RuntimeException(em + "; Bytes: [" + bytes + "]; Error code [" + ec + "]; DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "]");
+            }
+
             if (ptr == null)
                 throw new RuntimeException("Failed to allocate " + bytes + " bytes from DEVICE [" + Nd4j.getAffinityManager().getDeviceForCurrentThread() + "] memory");
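The pattern here, querying lastErrorCode() right after the native call and surfacing lastErrorMessage() in the exception, turns silent CUDA allocation failures into diagnosable Java exceptions. A sketch of the same check factored into a reusable helper (the helper itself is hypothetical; the NativeOps calls are the ones used above):

    // Hypothetical helper wrapping the error-code check shown above
    private static void checkNativeError(long bytes) {
        val ops = NativeOpsHolder.getInstance().getDeviceNativeOps();
        val ec = ops.lastErrorCode();
        if (ec != 0) {
            val em = ops.lastErrorMessage();
            throw new RuntimeException(em + "; Bytes: [" + bytes + "]; Error code [" + ec + "]");
        }
    }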
@@ -85,6 +85,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCudaBuffer {
     }

     public OpaqueDataBuffer getOpaqueDataBuffer() {
+        if (released)
+            throw new IllegalStateException("You can't use DataBuffer once it was released");
+
         return ptrDataBuffer;
     }

@@ -104,7 +107,8 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCudaBuffer {
         ptrDataBuffer = OpaqueDataBuffer.externalizedDataBuffer(length, this.type, pointer, specialPointer);
         this.allocationPoint = new AllocationPoint(ptrDataBuffer, this.type.width() * length);

         Nd4j.getDeallocatorService().pickObject(this);
+        if (released)
+            throw new IllegalStateException("You can't use DataBuffer once it was released");
     }

     /**
@@ -473,6 +477,9 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCudaBuffer {
     }

     public BaseCudaDataBuffer(@NonNull DataBuffer underlyingBuffer, long length, long offset) {
+        if (underlyingBuffer.wasClosed())
+            throw new IllegalStateException("You can't use DataBuffer once it was released");
+
         //this(length, underlyingBuffer.getElementSize(), offset);
         this.allocationMode = AllocationMode.MIXED_DATA_TYPES;
         initTypeAndSize();
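With this guard, constructing a view over an already-released buffer fails fast instead of touching freed memory. A short sketch of the expected behavior (hedged; getRow(1) is one of several code paths that would build a view through this constructor):

    INDArray x = Nd4j.create(DataType.FLOAT, 5, 6);
    x.close();

    try {
        x.getRow(1);             // would wrap a released buffer
    } catch (IllegalStateException e) {
        // expected: "You can't use DataBuffer once it was released"
    }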
@@ -198,4 +198,27 @@ public class BaseCudaDataBufferTest extends BaseND4JTest {
         // there should be no exceptions during execution
         assertEquals(Nd4j.getAffinityManager().getNumberOfDevices(), cnt.get());
     }
+
+    @Test
+    public void testClose_1() {
+        val x = Nd4j.createFromArray(1, 2, 3);
+
+        x.close();
+
+        assertTrue(x.wasClosed());
+        assertTrue(x.data().wasClosed());
+    }
+
+    @Test
+    public void testClose_2() {
+        val x = Nd4j.create(DataType.FLOAT, 5, 6);
+        val row = x.getRow(1);
+        x.close();
+
+        assertTrue(x.wasClosed());
+        assertTrue(x.data().wasClosed());
+
+        assertTrue(row.wasClosed());
+        assertTrue(row.data().wasClosed());
+    }
 }
@@ -61,6 +61,9 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallocatable {
     }

     public OpaqueDataBuffer getOpaqueDataBuffer() {
+        if (released)
+            throw new IllegalStateException("You can't use DataBuffer once it was released");
+
         return ptrDataBuffer;
     }
