From 5d28e6143df90693cace3e3f2428e6c945703845 Mon Sep 17 00:00:00 2001 From: raver119 Date: Wed, 5 Feb 2020 07:27:24 +0300 Subject: [PATCH] OpContext handling (#214) * nano tweaks Signed-off-by: raver119 * OpContext tweaks Signed-off-by: raver119 * OpContext deallocators Signed-off-by: raver119 * get rid of few mkldnn safety checks Signed-off-by: raver119 * databuffer setSpecial fix Signed-off-by: raver119 --- .../layers/mkldnn/MKLDNNBatchNormHelper.java | 3 +- .../nn/layers/mkldnn/MKLDNNConvHelper.java | 11 ++---- ...KLDNNLocalResponseNormalizationHelper.java | 6 ++-- .../mkldnn/MKLDNNSubsamplingHelper.java | 5 ++- libnd4j/blas/NativeOps.h | 1 + libnd4j/blas/cpu/NativeOps.cpp | 4 +++ libnd4j/blas/cuda/NativeOps.cu | 4 +++ libnd4j/include/array/impl/DataBuffer.cpp | 5 +++ libnd4j/include/graph/Context.h | 7 ++++ libnd4j/include/graph/impl/Context.cpp | 10 ++++++ .../declarable/platform/mkldnn/batchnorm.cpp | 4 --- .../ops/declarable/platform/mkldnn/conv2d.cpp | 4 --- .../ops/declarable/platform/mkldnn/conv3d.cpp | 5 --- .../declarable/platform/mkldnn/deconv2d.cpp | 4 --- .../declarable/platform/mkldnn/deconv3d.cpp | 4 --- .../platform/mkldnn/depthwiseConv2d.cpp | 5 --- .../deallocation/DeallocatorService.java | 2 +- .../nd4j/linalg/api/ops/BaseOpContext.java | 6 ++++ .../org/nd4j/linalg/api/ops/OpContext.java | 5 +++ .../java/org/nd4j/nativeblas/NativeOps.java | 1 + .../ops/executioner/CudaOpContext.java | 32 +++++++++++++++-- .../executioner/CudaOpContextDeallocator.java | 34 +++++++++++++++++++ .../java/org/nd4j/nativeblas/Nd4jCuda.java | 8 +++++ .../nativecpu/buffer/BaseCpuDataBuffer.java | 4 +-- .../cpu/nativecpu/buffer/CpuDeallocator.java | 2 +- .../cpu/nativecpu/buffer/LongBuffer.java | 3 +- .../cpu/nativecpu/ops/CpuOpContext.java | 33 ++++++++++++++++-- .../ops/CpuOpContextDeallocator.java | 34 +++++++++++++++++++ .../java/org/nd4j/nativeblas/Nd4jCpu.java | 8 +++++ .../test/java/org/nd4j/linalg/Nd4jTestsC.java | 25 ++++++++++++++ 30 files changed, 229 insertions(+), 50 deletions(-) create mode 100644 nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContextDeallocator.java create mode 100644 nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContextDeallocator.java diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java index 2e8c04aa3..027f9d80d 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java @@ -147,8 +147,7 @@ public class MKLDNNBatchNormHelper implements BatchNormalizationHelper { } //Note: batchnorm op expects rank 1 inputs for mean/var etc, not rank 2 shape [1,x] - context.getInputArrays().clear(); - context.getOutputArrays().clear(); + context.purge(); context.setInputArray(0, x); context.setInputArray(1, m); context.setInputArray(2, v); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNConvHelper.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNConvHelper.java index 244f7c1fc..9bbf4deae 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNConvHelper.java +++ 
b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNConvHelper.java @@ -89,8 +89,7 @@ public class MKLDNNConvHelper implements ConvolutionHelper { INDArray[] inputsArr = biasGradView == null ? new INDArray[]{input, weightsPermute, delta} : new INDArray[]{input, weightsPermute, bias, delta}; INDArray[] outputArr = biasGradView == null ? new INDArray[]{gradAtInput, weightGradViewPermute} : new INDArray[]{gradAtInput, weightGradViewPermute, biasGradView}; - contextBwd.getInputArrays().clear(); - contextBwd.getOutputArrays().clear(); + contextBwd.purge(); for( int i=0; isetExecutionMode((samediff::ExecutionMode) execMode); } +void ctxPurge(OpaqueContext* ptr) { + ptr->clearFastPath(); +} + nd4j::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { return new nd4j::graph::RandomGenerator(rootSeed, nodeSeed); } diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index d65dcaed5..07ce876ea 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3771,6 +3771,10 @@ void ctxShapeFunctionOverride(OpaqueContext* ptr, bool reallyOverride) { ptr->setShapeFunctionOverride(reallyOverride); } +void ctxPurge(OpaqueContext* ptr) { + ptr->clearFastPath(); +} + int binaryLevel() { return 0; } diff --git a/libnd4j/include/array/impl/DataBuffer.cpp b/libnd4j/include/array/impl/DataBuffer.cpp index 49527026c..36758c684 100644 --- a/libnd4j/include/array/impl/DataBuffer.cpp +++ b/libnd4j/include/array/impl/DataBuffer.cpp @@ -305,12 +305,17 @@ namespace nd4j { if (_primaryBuffer != nullptr && _isOwnerPrimary) { deletePrimary(); } + _primaryBuffer = buffer; _isOwnerPrimary = false; _lenInBytes = length * DataTypeUtils::sizeOf(_dataType); } void DataBuffer::setSpecialBuffer(void *buffer, size_t length) { + if (_specialBuffer != nullptr && _isOwnerSpecial) { + deleteSpecial(); + } + this->setSpecial(buffer, false); _lenInBytes = length * DataTypeUtils::sizeOf(_dataType); } diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index 96b7e1c79..d1e8a4dad 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -204,6 +204,13 @@ namespace nd4j { void setBArguments(const std::vector &tArgs); void setDArguments(const std::vector &dArgs); + /** + * This method purges fastpath in/out contents and releases all the handles. 
+ * + * PLEASE NOTE: I/T/B/D args will stay intact + */ + void clearFastPath(); + void setCudaContext(Nd4jPointer cudaStream, Nd4jPointer reductionPointer, Nd4jPointer allocationPointer); void allowHelpers(bool reallyAllow); diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index 4c7a19133..5add8280d 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -563,6 +563,16 @@ namespace nd4j { for (auto d:dArgs) _dArgs.emplace_back(d); } + + void Context::clearFastPath() { + _fastpath_in.clear(); + _fastpath_out.clear(); + + for (auto v:_handles) + delete v; + + _handles.clear(); + } } } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index 8974cef14..0ebee8fbf 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -456,10 +456,6 @@ PLATFORM_IMPL(batchnorm, ENGINE_CPU) { ////////////////////////////////////////////////////////////////////////// PLATFORM_CHECK(batchnorm, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - // if (::optimalLevel() < 2) - // return false; - auto input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw auto mean = INPUT_VARIABLE(1); // [c] auto variance = INPUT_VARIABLE(2); // [c] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp index 559edf2cd..1b90812b1 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp @@ -265,10 +265,6 @@ PLATFORM_IMPL(conv2d, ENGINE_CPU) { } PLATFORM_CHECK(conv2d, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - if (::optimalLevel() < 2) - return false; - auto input = INPUT_VARIABLE(0); auto weights = INPUT_VARIABLE(1); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 747d84c36..096839d79 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -270,10 +270,6 @@ PLATFORM_IMPL(conv3dnew, ENGINE_CPU) { } PLATFORM_CHECK(conv3dnew, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - if (::optimalLevel() < 2) - return false; - auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] @@ -335,7 +331,6 @@ PLATFORM_IMPL(conv3dnew_bp, ENGINE_CPU) { } PLATFORM_CHECK(conv3dnew_bp, ENGINE_CPU) { - auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always auto bias = block.width() > 3 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp index d95052c5a..e63d7440c 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp @@ -407,10 +407,6 @@ PLATFORM_IMPL(deconv2d, ENGINE_CPU) { } PLATFORM_CHECK(deconv2d, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - // if (::optimalLevel() < 2) - // return false; - auto input = INPUT_VARIABLE(0); auto weights = INPUT_VARIABLE(1); auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp index a678e0185..490ce4535 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp @@ -422,10 +422,6 @@ PLATFORM_IMPL(deconv3d, ENGINE_CPU) { } PLATFORM_CHECK(deconv3d, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - // if (::optimalLevel() < 2) - // return false; - auto input = INPUT_VARIABLE(0); auto weights = INPUT_VARIABLE(1); auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp index fc7a1e9e3..d6722c009 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp @@ -401,10 +401,6 @@ PLATFORM_IMPL(depthwise_conv2d, ENGINE_CPU) { ////////////////////////////////////////////////////////////////////// PLATFORM_CHECK(depthwise_conv2d, ENGINE_CPU) { - // we don't want to use mkldnn if cpu doesn't support avx/avx2 - if (::optimalLevel() < 2) - return false; - auto input = INPUT_VARIABLE(0); auto weights = INPUT_VARIABLE(1); auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; @@ -477,7 +473,6 @@ PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CPU) { ////////////////////////////////////////////////////////////////////// PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CPU) { - auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always auto bias = block.width() > 3 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] = [iC*mC] diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/memory/deallocation/DeallocatorService.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/memory/deallocation/DeallocatorService.java index dffb93a7b..ded5bc938 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/memory/deallocation/DeallocatorService.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/memory/deallocation/DeallocatorService.java @@ -43,7 +43,7 @@ public class DeallocatorService { private Map referenceMap = new ConcurrentHashMap<>(); private List>> deviceMap = new ArrayList<>(); - private AtomicLong counter = new AtomicLong(0); + private final transient AtomicLong counter = new AtomicLong(0); public DeallocatorService() { // we need to have at least 2 threads, but for CUDA we'd need at least numDevices threads, due to thread->device affinity diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/BaseOpContext.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/BaseOpContext.java index 4a56e2a88..0139a9db5 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/BaseOpContext.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/BaseOpContext.java @@ -153,4 +153,10 @@ public abstract class BaseOpContext implements OpContext { for (int e = 0; e < arrays.length; e++) setOutputArray(e, arrays[e]); } + + @Override + public void purge() { + fastpath_in.clear(); + fastpath_out.clear(); + } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java index 4063746b3..62a4906a7 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java @@ -162,4 +162,9 @@ public interface OpContext extends AutoCloseable { * @param mode */ void setExecutionMode(ExecutionMode mode); + + /** + * This method removes all in/out arrays from this OpContext + */ + void purge(); } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index d284974eb..1a01bf278 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1161,6 +1161,7 @@ public interface NativeOps { void ctxAllowHelpers(OpaqueContext ptr, boolean reallyAllow); void ctxSetExecutionMode(OpaqueContext ptr, int execMode); void ctxShapeFunctionOverride(OpaqueContext ptr, boolean reallyOverride); + void ctxPurge(OpaqueContext ptr); void deleteGraphContext(OpaqueContext ptr); OpaqueRandomGenerator createRandomGenerator(long rootSeed, long nodeSeed); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java index 01127e891..5e26b3ea3 100644 --- 
a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java @@ -23,6 +23,8 @@ import org.nd4j.jita.allocator.impl.AtomicAllocator; import org.nd4j.jita.allocator.pointers.cuda.cudaStream_t; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.concurrency.AffinityManager; +import org.nd4j.linalg.api.memory.Deallocatable; +import org.nd4j.linalg.api.memory.Deallocator; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.BaseOpContext; import org.nd4j.linalg.api.ops.ExecutionMode; @@ -40,14 +42,19 @@ import org.nd4j.nativeblas.OpaqueRandomGenerator; * CUDA wrapper for op Context * @author raver119@gmail.com */ -public class CudaOpContext extends BaseOpContext implements OpContext { +public class CudaOpContext extends BaseOpContext implements OpContext, Deallocatable { // we might want to have configurable private NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps(); private OpaqueContext context = nativeOps.createGraphContext(1); + private final transient long id = Nd4j.getDeallocatorService().nextValue(); + + public CudaOpContext() { + Nd4j.getDeallocatorService().pickObject(this); + } @Override public void close() { - nativeOps.deleteGraphContext(context); + // no-op } @Override @@ -143,4 +150,25 @@ public class CudaOpContext extends BaseOpContext implements OpContext { super.setExecutionMode(mode); nativeOps.ctxSetExecutionMode(context, mode.ordinal()); } + + @Override + public void purge() { + super.purge(); + nativeOps.ctxPurge(context); + } + + @Override + public String getUniqueId() { + return new String("CTX_" + id); + } + + @Override + public Deallocator deallocator() { + return new CudaOpContextDeallocator(this); + } + + @Override + public int targetDevice() { + return 0; + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContextDeallocator.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContextDeallocator.java new file mode 100644 index 000000000..62b5e4a00 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContextDeallocator.java @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.jcublas.ops.executioner; + +import org.nd4j.linalg.api.memory.Deallocator; +import org.nd4j.nativeblas.NativeOpsHolder; +import org.nd4j.nativeblas.OpaqueContext; + +public class CudaOpContextDeallocator implements Deallocator { + private transient final OpaqueContext context; + + public CudaOpContextDeallocator(CudaOpContext ctx) { + context = (OpaqueContext) ctx.contextPointer(); + } + + @Override + public void deallocate() { + NativeOpsHolder.getInstance().getDeviceNativeOps().deleteGraphContext(context); + } +} diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index f85ae9cf1..e7ddcda11 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -3090,6 +3090,7 @@ public native OpaqueRandomGenerator getGraphContextRandomGenerator(OpaqueContext public native void ctxAllowHelpers(OpaqueContext ptr, @Cast("bool") boolean reallyAllow); public native void ctxShapeFunctionOverride(OpaqueContext ptr, @Cast("bool") boolean reallyOverride); public native void ctxSetExecutionMode(OpaqueContext ptr, int execMode); +public native void ctxPurge(OpaqueContext ptr); public native void markGraphContextInplace(OpaqueContext ptr, @Cast("bool") boolean reallyInplace); public native void setGraphContextCudaContext(OpaqueContext ptr, Pointer stream, Pointer reductionPointer, Pointer allocationPointer); public native void setGraphContextInputArray(OpaqueContext ptr, int index, Pointer buffer, Pointer shapeInfo, Pointer specialBuffer, Pointer specialShapeInfo); @@ -6453,6 +6454,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntBuffer dArgs); public native void setDArguments(@Cast("nd4j::DataType*") @StdVector int[] dArgs); + /** + * This method purges fastpath in/out contents and releases all the handles. 
+ * + * PLEASE NOTE: I/T/B/D args will stay intact + */ + public native void clearFastPath(); + public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer); public native void allowHelpers(@Cast("bool") boolean reallyAllow); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/BaseCpuDataBuffer.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/BaseCpuDataBuffer.java index 71583638a..a51666f78 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/BaseCpuDataBuffer.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/BaseCpuDataBuffer.java @@ -43,7 +43,7 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallo protected transient OpaqueDataBuffer ptrDataBuffer; - private final long instanceId = Nd4j.getDeallocatorService().nextValue(); + private transient final long instanceId = Nd4j.getDeallocatorService().nextValue(); protected BaseCpuDataBuffer() { @@ -52,7 +52,7 @@ public abstract class BaseCpuDataBuffer extends BaseDataBuffer implements Deallo @Override public String getUniqueId() { - return "BCDB_" + instanceId; + return new String("BCDB_" + instanceId); } @Override diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/CpuDeallocator.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/CpuDeallocator.java index 3b8a46fa6..e808ebaa3 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/CpuDeallocator.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/CpuDeallocator.java @@ -28,7 +28,7 @@ import org.nd4j.nativeblas.OpaqueDataBuffer; */ @Slf4j public class CpuDeallocator implements Deallocator { - private OpaqueDataBuffer opaqueDataBuffer; + private final transient OpaqueDataBuffer opaqueDataBuffer; public CpuDeallocator(BaseCpuDataBuffer buffer) { opaqueDataBuffer = buffer.getOpaqueDataBuffer(); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/LongBuffer.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/LongBuffer.java index 898a125f2..19ad6f907 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/LongBuffer.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/buffer/LongBuffer.java @@ -28,6 +28,7 @@ import org.nd4j.linalg.api.memory.MemoryWorkspace; import org.nd4j.linalg.api.memory.pointers.PagedPointer; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.nativeblas.NativeOpsHolder; +import org.nd4j.nativeblas.OpaqueDataBuffer; import java.nio.ByteBuffer; @@ -123,7 +124,7 @@ public class LongBuffer extends BaseCpuDataBuffer { // we still want this buffer to have native representation - ptrDataBuffer = NativeOpsHolder.getInstance().getDeviceNativeOps().allocateDataBuffer(0, DataType.INT64.toInt(), false); + ptrDataBuffer = OpaqueDataBuffer.allocateDataBuffer(0, DataType.INT64, false); 
NativeOpsHolder.getInstance().getDeviceNativeOps().dbSetPrimaryBuffer(ptrDataBuffer, this.pointer, numberOfElements); Nd4j.getDeallocatorService().pickObject(this); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java index 461646311..9d79e6545 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java @@ -20,11 +20,14 @@ import lombok.NonNull; import lombok.val; import org.bytedeco.javacpp.*; import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.memory.Deallocatable; +import org.nd4j.linalg.api.memory.Deallocator; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.BaseOpContext; import org.nd4j.linalg.api.ops.ExecutionMode; import org.nd4j.linalg.api.ops.OpContext; import org.nd4j.linalg.cpu.nativecpu.buffer.BaseCpuDataBuffer; +import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.primitives.Pair; import org.nd4j.nativeblas.NativeOps; import org.nd4j.nativeblas.NativeOpsHolder; @@ -38,14 +41,19 @@ import java.util.List; * * @author raver119@gmail.com */ -public class CpuOpContext extends BaseOpContext implements OpContext { +public class CpuOpContext extends BaseOpContext implements OpContext, Deallocatable { // we might want to have configurable private NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps(); private OpaqueContext context = nativeOps.createGraphContext(1); + private final transient long id = Nd4j.getDeallocatorService().nextValue(); + + public CpuOpContext() { + Nd4j.getDeallocatorService().pickObject(this); + } @Override public void close() { - nativeOps.deleteGraphContext(context); + // no-op } @Override @@ -136,4 +144,25 @@ public class CpuOpContext extends BaseOpContext implements OpContext { super.setExecutionMode(mode); nativeOps.ctxSetExecutionMode(context, mode.ordinal()); } + + @Override + public void purge() { + super.purge(); + nativeOps.ctxPurge(context); + } + + @Override + public String getUniqueId() { + return new String("CTX_" + id); + } + + @Override + public Deallocator deallocator() { + return new CpuOpContextDeallocator(this); + } + + @Override + public int targetDevice() { + return 0; + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContextDeallocator.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContextDeallocator.java new file mode 100644 index 000000000..621f882bd --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContextDeallocator.java @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.cpu.nativecpu.ops; + +import org.nd4j.linalg.api.memory.Deallocator; +import org.nd4j.nativeblas.NativeOpsHolder; +import org.nd4j.nativeblas.OpaqueContext; + +public class CpuOpContextDeallocator implements Deallocator { + private transient final OpaqueContext context; + + public CpuOpContextDeallocator(CpuOpContext ctx) { + context = (OpaqueContext) ctx.contextPointer(); + } + + @Override + public void deallocate() { + NativeOpsHolder.getInstance().getDeviceNativeOps().deleteGraphContext(context); + } +} diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 5522141be..b954a4a34 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -3093,6 +3093,7 @@ public native OpaqueRandomGenerator getGraphContextRandomGenerator(OpaqueContext public native void ctxAllowHelpers(OpaqueContext ptr, @Cast("bool") boolean reallyAllow); public native void ctxShapeFunctionOverride(OpaqueContext ptr, @Cast("bool") boolean reallyOverride); public native void ctxSetExecutionMode(OpaqueContext ptr, int execMode); +public native void ctxPurge(OpaqueContext ptr); public native void markGraphContextInplace(OpaqueContext ptr, @Cast("bool") boolean reallyInplace); public native void setGraphContextCudaContext(OpaqueContext ptr, Pointer stream, Pointer reductionPointer, Pointer allocationPointer); public native void setGraphContextInputArray(OpaqueContext ptr, int index, Pointer buffer, Pointer shapeInfo, Pointer specialBuffer, Pointer specialShapeInfo); @@ -6456,6 +6457,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntBuffer dArgs); public native void setDArguments(@Cast("nd4j::DataType*") @StdVector int[] dArgs); + /** + * This method purges fastpath in/out contents and releases all the handles. 
+ *
+ * PLEASE NOTE: I/T/B/D args will stay intact
+ */
+ public native void clearFastPath();
+
 public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer);
 public native void allowHelpers(@Cast("bool") boolean reallyAllow);
diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
index d96c0ed31..ad5bacc4e 100644
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
@@ -8262,6 +8262,31 @@ public class Nd4jTestsC extends BaseNd4jTest {
         assertArrayEquals(new long[]{10, 0}, out2.shape());
     }
+    @Test
+    public void testDealloc_1() throws Exception {
+
+        for (int e = 0; e < 5000; e++){
+            try(val ws = Nd4j.getWorkspaceManager().getAndActivateWorkspace("someid")) {
+                val x = Nd4j.createUninitialized(DataType.FLOAT, 1, 1000);
+                //val y = x.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, 100)).reshape('c', 10, 10);
+                //val z = x.get(NDArrayIndex.point(0), NDArrayIndex.interval(100, 200)).reshape('c', 10, 10);
+                //val a = x.get(NDArrayIndex.point(0), NDArrayIndex.interval(200, 300)).reshape('f', 10, 10);
+            } finally {
+                //System.gc();
+            }
+        }
+
+        Thread.sleep(1000);
+        System.gc();
+
+        Thread.sleep(1000);
+        System.gc();
+        System.gc();
+        System.gc();
+
+        //Nd4j.getMemoryManager().printRemainingStacks();
+    }
+
     @Override
     public char ordering() {
         return 'c';
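
Note on the purge() API introduced by this patch: OpContext.purge() (ctxPurge() / Context::clearFastPath() on the native side) drops only the fastpath input/output arrays and, natively, the NDArray handles the context owns, while I/T/B/D arguments stay intact; this is why the MKLDNN helpers above can replace the getInputArrays().clear() / getOutputArrays().clear() pair with a single purge() call. Below is a minimal sketch of reusing one context across several executions. It assumes the public OpContext API plus Nd4j.getExecutioner().buildContext() and Nd4j.exec(CustomOp, OpContext) as entry points, and "op" is a hypothetical placeholder for any pre-built CustomOp, not something defined in this patch.

    // Sketch only: reuse a single OpContext, purging per-iteration arrays between runs.
    // "op" is a hypothetical pre-built CustomOp supplied by the caller.
    public static void reuseContext(CustomOp op) {
        OpContext ctx = Nd4j.getExecutioner().buildContext();
        ctx.setIArguments(1);                                 // I/T/B/D args survive purge()
        for (int i = 0; i < 3; i++) {
            INDArray in = Nd4j.createUninitialized(DataType.FLOAT, 2, 3).assign(1.0f);
            INDArray out = Nd4j.createUninitialized(DataType.FLOAT, 2, 3);
            ctx.purge();                                      // clears only the fastpath input/output arrays
            ctx.setInputArray(0, in);
            ctx.setOutputArray(0, out);
            Nd4j.exec(op, ctx);                               // runs the op against the context's arrays
        }
        // No eager close() is required after this patch: the native graph context is
        // also tracked by the DeallocatorService and released once ctx is collected.
    }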
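
Lifecycle note: CpuOpContext and CudaOpContext now implement Deallocatable, register themselves with the DeallocatorService in their constructors, and turn close() into a no-op; the native graph context is released later by CpuOpContextDeallocator / CudaOpContextDeallocator once the Java wrapper becomes unreachable, which is what testDealloc_1 above stresses. A condensed sketch of the pattern follows, built from the interfaces used in this patch (Deallocatable, Deallocator, DeallocatorService, NativeOpsHolder, OpaqueContext); TrackedContext and TrackedContextDeallocator are illustrative names, not classes in the codebase.

    // Sketch only: the shape shared by CpuOpContext/CudaOpContext after this patch.
    public class TrackedContext implements Deallocatable {
        private final NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
        private final OpaqueContext context = nativeOps.createGraphContext(1);
        // unique id under which the DeallocatorService tracks this instance
        private final transient long id = Nd4j.getDeallocatorService().nextValue();

        public TrackedContext() {
            // hands a reference to the service; the Deallocator below runs once
            // this object is no longer reachable from Java code
            Nd4j.getDeallocatorService().pickObject(this);
        }

        @Override
        public String getUniqueId() {
            return "CTX_" + id;
        }

        @Override
        public Deallocator deallocator() {
            // a separate object that holds only the opaque native handle; if it
            // referenced TrackedContext itself, the context could never be collected
            return new TrackedContextDeallocator(context);
        }

        @Override
        public int targetDevice() {
            return 0;
        }
    }

    // Performs the native release that close() used to do eagerly.
    class TrackedContextDeallocator implements Deallocator {
        private final transient OpaqueContext context;

        TrackedContextDeallocator(OpaqueContext context) {
            this.context = context;
        }

        @Override
        public void deallocate() {
            NativeOpsHolder.getInstance().getDeviceNativeOps().deleteGraphContext(context);
        }
    }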