From 5d6906917743e108c3e8a30f47c150d33137a7e0 Mon Sep 17 00:00:00 2001 From: raver119 Date: Fri, 24 Jan 2020 10:11:09 +0300 Subject: [PATCH] [WIP] Memory limits (#167) * initial commit Signed-off-by: raver119 * one more initial commit Signed-off-by: raver119 * additional initial commit Signed-off-by: raver119 * subsequent initial commit Signed-off-by: raver119 * initial commit testing Signed-off-by: raver119 * initial commit per device Signed-off-by: raver119 * initial commit per group Signed-off-by: raver119 * initial commit for cuda Signed-off-by: raver119 * initial commit for cuda + few missed lines Signed-off-by: raver119 * initial commit for cuda + missed includes Signed-off-by: raver119 * initial commit for cuda + one more missed include Signed-off-by: raver119 * initial commit shouldn't count host mem as dev0 in cuda Signed-off-by: raver119 * initial commit that tracks HOST group limits for CUDA Signed-off-by: raver119 * initial commit with some Environment changes Signed-off-by: raver119 * initial commit with more Environment changes Signed-off-by: raver119 * initial commit with maxMasterThreads fix Signed-off-by: raver119 * initial commit with maxMasterThreads fix Signed-off-by: raver119 * initial commit without maxMasterThreads exception Signed-off-by: raver119 * initial commit without Nd4jULong in Environment Signed-off-by: raver119 * add sleep and more iterations for OOM cases Signed-off-by: raver119 * limits propagation from java side Signed-off-by: raver119 * - consume ErrorCode every time - one test for memory limits Signed-off-by: raver119 * unordered_map Signed-off-by: raver119 * unordered_map Signed-off-by: raver119 * unordered_map Signed-off-by: raver119 * RSub op mapping fixed Signed-off-by: raver119 * typo fixed Signed-off-by: raver119 * one bad test fixed Signed-off-by: raver119 --- libnd4j/blas/Environment.cpp | 43 +++++- libnd4j/blas/Environment.h | 21 +++ libnd4j/blas/NDArray.h | 1 + libnd4j/blas/NativeOps.h | 1 + 
libnd4j/blas/cpu/NativeOps.cpp | 17 +- libnd4j/blas/cuda/NativeOps.cu | 17 +- libnd4j/include/array/ConstantDescriptor.h | 2 +- libnd4j/include/array/NDArrayList.h | 2 +- libnd4j/include/array/ShapeDescriptor.h | 2 +- libnd4j/include/array/cuda/DataBuffer.cu | 15 ++ libnd4j/include/array/impl/DataBuffer.cpp | 34 +++- .../include/exceptions/allocation_exception.h | 1 + .../exceptions/impl/allocation_exception.cpp | 7 + libnd4j/include/graph/ExecutionResult.h | 2 +- libnd4j/include/graph/Graph.h | 2 +- libnd4j/include/graph/GraphHolder.h | 2 +- libnd4j/include/graph/GraphState.h | 2 +- libnd4j/include/graph/Scope.h | 2 +- libnd4j/include/graph/Stash.h | 2 +- libnd4j/include/graph/VariableSpace.h | 2 +- libnd4j/include/memory/MemoryCounter.h | 146 ++++++++++++++++++ libnd4j/include/memory/MemoryTracker.h | 3 + libnd4j/include/memory/impl/MemoryCounter.cpp | 133 ++++++++++++++++ .../include/ops/declarable/OpRegistrator.h | 2 +- .../ops/declarable/helpers/cpu/segment.cpp | 2 +- .../layers_tests/DataBufferTests.cpp | 78 ++++++++++ .../layers_tests/DataBufferTestsCuda.cu | 87 +++++++++++ .../layers_tests/DataTypesValidationTests.cpp | 2 - .../pairwise/arithmetic/RSubOp.java | 2 +- .../org/nd4j/nativeblas/OpaqueDataBuffer.java | 24 ++- .../org/nd4j/nativeblas/CudaEnvironment.java | 25 +++ .../java/org/nd4j/nativeblas/Nd4jCuda.java | 38 ++++- .../linalg/cpu/nativecpu/CpuEnvironment.java | 25 +++ .../java/org/nd4j/nativeblas/Nd4jCpu.java | 68 +++++++- .../test/java/org/nd4j/linalg/Nd4jTestsC.java | 2 +- .../nd4j/linalg/api/TestNDArrayCreation.java | 41 ++++- .../org/nd4j/linalg/factory/Environment.java | 35 +++++ 37 files changed, 844 insertions(+), 46 deletions(-) create mode 100644 libnd4j/include/memory/MemoryCounter.h create mode 100644 libnd4j/include/memory/impl/MemoryCounter.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu diff --git a/libnd4j/blas/Environment.cpp 
b/libnd4j/blas/Environment.cpp index f423c73dd..3b9502534 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/blas/Environment.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef _OPENMP @@ -291,11 +292,19 @@ namespace nd4j { } void Environment::setMaxThreads(int max) { + // FIXME: not possible at this moment, since maxThreads is limited by number of threads in pool. however we can allocate more threads if we want //_maxThreads.store(max); } void Environment::setMaxMasterThreads(int max) { - //_maxMasterThreads = max; + if (max > maxThreads()) { + max = maxThreads(); + } + + if (max < 1) + return; + + _maxMasterThreads = max; } bool Environment::precisionBoostAllowed() { @@ -334,6 +343,38 @@ namespace nd4j { _allowHelpers.store(reallyAllow); } + void Environment::setGroupLimit(int group, Nd4jLong numBytes) { + nd4j::memory::MemoryCounter::getInstance()->setGroupLimit((nd4j::memory::MemoryType) group, numBytes); + } + + void Environment::setDeviceLimit(int deviceId, Nd4jLong numBytes) { + nd4j::memory::MemoryCounter::getInstance()->setDeviceLimit(deviceId, numBytes); + } + + Nd4jLong Environment::getGroupLimit(int group) { + return nd4j::memory::MemoryCounter::getInstance()->groupLimit((nd4j::memory::MemoryType) group); + } + + Nd4jLong Environment::getDeviceLimit(int deviceId) { + return nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId); + } + + Nd4jLong Environment::getGroupCounter(int group) { + return nd4j::memory::MemoryCounter::getInstance()->allocatedGroup((nd4j::memory::MemoryType) group); + } + + Nd4jLong Environment::getDeviceCounter(int deviceId) { + return nd4j::memory::MemoryCounter::getInstance()->allocatedDevice(deviceId); + } + + uint64_t Environment::maxPrimaryMemory() { + return _maxTotalPrimaryMemory.load(); + } + + uint64_t Environment::maxSpecialMemory() { + return _maxTotalSpecialMemory.load(); + } + nd4j::Environment *nd4j::Environment::_instance = 0; } diff --git a/libnd4j/blas/Environment.h 
b/libnd4j/blas/Environment.h index 54982471f..5bef3f1e4 100644 --- a/libnd4j/blas/Environment.h +++ b/libnd4j/blas/Environment.h @@ -27,6 +27,7 @@ #include #include #include +#include namespace nd4j{ class ND4J_EXPORT Environment { @@ -97,10 +98,30 @@ namespace nd4j{ int maxMasterThreads(); void setMaxMasterThreads(int max); + /* + * Legacy memory limits API, still used in new API as simplified version + */ void setMaxPrimaryMemory(uint64_t maxBytes); void setMaxSpecialyMemory(uint64_t maxBytes); void setMaxDeviceMemory(uint64_t maxBytes); + uint64_t maxPrimaryMemory(); + uint64_t maxSpecialMemory(); + //////////////////////// + + /* + * Methods for memory limits/counters + */ + void setGroupLimit(int group, Nd4jLong numBytes); + void setDeviceLimit(int deviceId, Nd4jLong numBytes); + + Nd4jLong getGroupLimit(int group); + Nd4jLong getDeviceLimit(int deviceId); + + Nd4jLong getGroupCounter(int group); + Nd4jLong getDeviceCounter(int deviceId); + //////////////////////// + bool isUseMKLDNN() { return _useMKLDNN.load(); } void setUseMKLDNN(bool useMKLDNN) { _useMKLDNN.store(useMKLDNN); } diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 037e6e1d0..671f72a57 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace nd4j { diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index 10893c08d..cd6274dfb 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -76,6 +76,7 @@ bool verbose = false; #include #include #include +#include typedef nd4j::InteropDataBuffer OpaqueDataBuffer; diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index e150a2039..1b1d22fbf 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -3093,8 +3093,14 @@ bool isOptimalRequirementsMet() { } OpaqueDataBuffer* allocateDataBuffer(Nd4jLong elements, int dataType, bool allocateBoth) { - auto dtype = DataTypeUtils::fromInt(dataType); 
- return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype) , dtype, allocateBoth); + try { + auto dtype = DataTypeUtils::fromInt(dataType); + return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype) , dtype, allocateBoth); + } catch (std::exception &e) { + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + return nullptr; + } } Nd4jPointer dbPrimaryBuffer(OpaqueDataBuffer *dataBuffer) { @@ -3126,7 +3132,12 @@ void dbAllocateSpecialBuffer(OpaqueDataBuffer *dataBuffer) { } void dbExpandBuffer(OpaqueDataBuffer *dataBuffer, Nd4jLong elements) { - dataBuffer->dataBuffer()->expand(elements * DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); + try { + dataBuffer->dataBuffer()->expand(elements * DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); + } catch (std::exception &e) { + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + } } OpaqueDataBuffer* dbCreateView(OpaqueDataBuffer *dataBuffer, Nd4jLong length, Nd4jLong offset) { diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 1f8149865..419cadef5 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3781,8 +3781,14 @@ void ctxAllowHelpers(OpaqueContext* ptr, bool reallyAllow) { } OpaqueDataBuffer* allocateDataBuffer(Nd4jLong elements, int dataType, bool allocateBoth) { - auto dtype = DataTypeUtils::fromInt(dataType); - return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype) , dtype, allocateBoth); + try { + auto dtype = DataTypeUtils::fromInt(dataType); + return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype), dtype, allocateBoth); + } catch (std::exception &e) { + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + return nullptr; + } } Nd4jPointer dbPrimaryBuffer(OpaqueDataBuffer *dataBuffer) { @@ -3814,7 +3820,12 @@ void dbAllocateSpecialBuffer(OpaqueDataBuffer *dataBuffer) { } void dbExpandBuffer(OpaqueDataBuffer *dataBuffer, Nd4jLong elements) { - dataBuffer->dataBuffer()->expand(elements * DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); + try { + dataBuffer->dataBuffer()->expand(elements * DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); + } catch (std::exception &e) { + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + } } OpaqueDataBuffer* dbCreateView(OpaqueDataBuffer *dataBuffer, Nd4jLong length, Nd4jLong offset) { diff --git a/libnd4j/include/array/ConstantDescriptor.h b/libnd4j/include/array/ConstantDescriptor.h index f2f2f46a6..f32c1c8bf 100644 --- a/libnd4j/include/array/ConstantDescriptor.h +++ b/libnd4j/include/array/ConstantDescriptor.h @@ -22,7 +22,7 @@ #define DEV_TESTS_CONSTANTDESCRIPTOR_H #include -#include +#include #include #include #include diff --git a/libnd4j/include/array/NDArrayList.h b/libnd4j/include/array/NDArrayList.h index 358469602..843b69a91 100644 --- a/libnd4j/include/array/NDArrayList.h +++ b/libnd4j/include/array/NDArrayList.h @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/libnd4j/include/array/ShapeDescriptor.h b/libnd4j/include/array/ShapeDescriptor.h index 25839cfa9..ddfd45a38 100644 --- a/libnd4j/include/array/ShapeDescriptor.h +++ b/libnd4j/include/array/ShapeDescriptor.h @@ -21,7 +21,7 @@ #ifndef DEV_TESTS_SHAPEDESCRIPTOR_H #define DEV_TESTS_SHAPEDESCRIPTOR_H -#include +#include #include #include #include diff --git a/libnd4j/include/array/cuda/DataBuffer.cu b/libnd4j/include/array/cuda/DataBuffer.cu index 2a3efa3c8..7cb66ac9d 100644 --- 
a/libnd4j/include/array/cuda/DataBuffer.cu +++ b/libnd4j/include/array/cuda/DataBuffer.cu @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include namespace nd4j { void DataBuffer::expand(const uint64_t size) { @@ -64,8 +67,20 @@ namespace nd4j { void DataBuffer::allocateSpecial() { if (_specialBuffer == nullptr && getLenInBytes() > 0) { + auto deviceId = nd4j::AffinityManager::currentDeviceId(); + + if (_workspace == nullptr) + if (!nd4j::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) + throw nd4j::allocation_exception::build("Requested amount exceeds device limits", nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), getLenInBytes()); + + ALLOCATE_SPECIAL(_specialBuffer, _workspace, getLenInBytes(), int8_t); _isOwnerSpecial = true; + + if (_workspace == nullptr) { + nd4j::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); + nd4j::memory::MemoryCounter::getInstance()->countIn(nd4j::memory::MemoryType::DEVICE, getLenInBytes()); + } } } diff --git a/libnd4j/include/array/impl/DataBuffer.cpp b/libnd4j/include/array/impl/DataBuffer.cpp index fae25478f..49527026c 100644 --- a/libnd4j/include/array/impl/DataBuffer.cpp +++ b/libnd4j/include/array/impl/DataBuffer.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include namespace nd4j { ///// IMLEMENTATION OF COMMON METHODS ///// @@ -232,14 +234,35 @@ namespace nd4j { void DataBuffer::allocatePrimary() { if (_primaryBuffer == nullptr && getLenInBytes() > 0) { + auto deviceId = nd4j::AffinityManager::currentDeviceId(); + // check if this allocation won't bring us above limit + if (_workspace == nullptr) { + if (Environment::getInstance()->isCPU()) { + // on cpu backend we validate against device 0 for now + if (!nd4j::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) + throw nd4j::allocation_exception::build("Requested amount exceeds HOST device limits", nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), 
getLenInBytes()); + } else { + // in heterogenous mode we valdate against device group + if (!nd4j::memory::MemoryCounter::getInstance()->validateGroup(nd4j::memory::MemoryType::HOST, getLenInBytes())) + throw nd4j::allocation_exception::build("Requested amount exceeds HOST group limits", nd4j::memory::MemoryCounter::getInstance()->groupLimit(nd4j::memory::MemoryType::HOST), getLenInBytes()); + } + } + ALLOCATE(_primaryBuffer, _workspace, getLenInBytes(), int8_t); _isOwnerPrimary = true; + + // count in towards current deviceId if we're not in workspace mode + if (_workspace == nullptr) { + if (Environment::getInstance()->isCPU()) // we don't want this counter to be added to CUDA device + nd4j::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); + + nd4j::memory::MemoryCounter::getInstance()->countIn(nd4j::memory::MemoryType::HOST, getLenInBytes()); + } } } //////////////////////////////////////////////////////////////////////// void DataBuffer::setAllocFlags(const bool isOwnerPrimary, const bool isOwnerSpecial) { - _isOwnerPrimary = isOwnerPrimary; _isOwnerSpecial = isOwnerSpecial; } @@ -252,6 +275,15 @@ namespace nd4j { RELEASE(p, _workspace); _primaryBuffer = nullptr; _isOwnerPrimary = false; + + + // count out towards DataBuffer device, only if we're not in workspace + if (_workspace == nullptr) { + if (Environment::getInstance()->isCPU()) + nd4j::memory::MemoryCounter::getInstance()->countOut(_deviceId, getLenInBytes()); + + nd4j::memory::MemoryCounter::getInstance()->countOut(nd4j::memory::MemoryType::HOST, getLenInBytes()); + } } } diff --git a/libnd4j/include/exceptions/allocation_exception.h b/libnd4j/include/exceptions/allocation_exception.h index 29756d253..458650037 100644 --- a/libnd4j/include/exceptions/allocation_exception.h +++ b/libnd4j/include/exceptions/allocation_exception.h @@ -40,6 +40,7 @@ namespace nd4j { ~allocation_exception() = default; static allocation_exception build(std::string message, Nd4jLong bytes); + static 
allocation_exception build(std::string message, Nd4jLong limit, Nd4jLong bytes); }; } diff --git a/libnd4j/include/exceptions/impl/allocation_exception.cpp b/libnd4j/include/exceptions/impl/allocation_exception.cpp index 76c6338da..85c3e72aa 100644 --- a/libnd4j/include/exceptions/impl/allocation_exception.cpp +++ b/libnd4j/include/exceptions/impl/allocation_exception.cpp @@ -31,4 +31,11 @@ namespace nd4j { message += "; Requested bytes: [" + bytes + "]"; return allocation_exception(message); } + + allocation_exception allocation_exception::build(std::string message, Nd4jLong limit, Nd4jLong numBytes) { + auto bytes = StringUtils::valueToString(numBytes); + auto lim = StringUtils::valueToString(limit); + message += "; Limit bytes: [" + lim + "]; Requested bytes: [" + bytes + "]"; + return allocation_exception(message); + } } \ No newline at end of file diff --git a/libnd4j/include/graph/ExecutionResult.h b/libnd4j/include/graph/ExecutionResult.h index 7a632a998..b1a1b1737 100644 --- a/libnd4j/include/graph/ExecutionResult.h +++ b/libnd4j/include/graph/ExecutionResult.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/libnd4j/include/graph/Graph.h b/libnd4j/include/graph/Graph.h index 18c23ec0f..00efb3c52 100644 --- a/libnd4j/include/graph/Graph.h +++ b/libnd4j/include/graph/Graph.h @@ -23,7 +23,7 @@ #include #include -#include +#include //#include #include #include diff --git a/libnd4j/include/graph/GraphHolder.h b/libnd4j/include/graph/GraphHolder.h index a60d088e7..f740ad4ca 100644 --- a/libnd4j/include/graph/GraphHolder.h +++ b/libnd4j/include/graph/GraphHolder.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/libnd4j/include/graph/GraphState.h b/libnd4j/include/graph/GraphState.h index 95f8a016d..52c6f9e16 100644 --- a/libnd4j/include/graph/GraphState.h +++ b/libnd4j/include/graph/GraphState.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include 
#include #include diff --git a/libnd4j/include/graph/Scope.h b/libnd4j/include/graph/Scope.h index 4e322b5b2..5cbbf8bc0 100644 --- a/libnd4j/include/graph/Scope.h +++ b/libnd4j/include/graph/Scope.h @@ -22,7 +22,7 @@ #define LIBND4J_SCOPE_H #include -#include +#include #include namespace nd4j { diff --git a/libnd4j/include/graph/Stash.h b/libnd4j/include/graph/Stash.h index 6613ec859..83a7ec066 100644 --- a/libnd4j/include/graph/Stash.h +++ b/libnd4j/include/graph/Stash.h @@ -23,7 +23,7 @@ //#include #include -#include +#include #include #include #include diff --git a/libnd4j/include/graph/VariableSpace.h b/libnd4j/include/graph/VariableSpace.h index 21bdc608b..9443d34b1 100644 --- a/libnd4j/include/graph/VariableSpace.h +++ b/libnd4j/include/graph/VariableSpace.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/libnd4j/include/memory/MemoryCounter.h b/libnd4j/include/memory/MemoryCounter.h new file mode 100644 index 000000000..bf8ff60dc --- /dev/null +++ b/libnd4j/include/memory/MemoryCounter.h @@ -0,0 +1,146 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_MEMORYCOUNTER_H +#define SD_MEMORYCOUNTER_H + +#include +#include +#include +#include +#include + +namespace nd4j { + namespace memory { + /** + * This class provides simple per-device and per-group memory counters + */ + class ND4J_EXPORT MemoryCounter { + private: + static MemoryCounter* _INSTANCE; + + // used for synchronization + std::mutex _locker; + + // per-device counters + std::map _deviceCounters; + + // TODO: change this wrt heterogeneous stuff on next iteration + // per-group counters + std::map _groupCounters; + + // per-device limits + std::map _deviceLimits; + + // per-group limits + std::map _groupLimits; + + MemoryCounter(); + ~MemoryCounter() = default; + + public: + static MemoryCounter *getInstance(); + + /** + * This method checks if allocation of numBytes won't break through per-device limit of the current device + * @param numBytes + * @return TRUE if allocated amount will keep us below limit, FALSE otherwise + */ + bool validate(Nd4jLong numBytes); + + /** + * This method checks if allocation of numBytes won't break through per-device limit + * @param deviceId + * @param numBytes + * @return TRUE if allocated amount will keep us below limit, FALSE otherwise + */ + bool validateDevice(int deviceId, Nd4jLong numBytes); + + /** + * This method checks if allocation of numBytes won't break through per-group limit + * @param group + * @param numBytes + * @return TRUE if allocated amount will keep us below limit, FALSE otherwise + */ + bool validateGroup(nd4j::memory::MemoryType group, Nd4jLong numBytes); + + /** + * This method adds specified number of bytes to specified counter + * @param deviceId + * @param numBytes + */ + void countIn(int deviceId, Nd4jLong numBytes); + void countIn(nd4j::memory::MemoryType group, Nd4jLong numBytes); + + /** + * This method subtracts specified number of bytes 
from specified counter + * @param deviceId + * @param numBytes + */ + void countOut(int deviceId, Nd4jLong numBytes); + void countOut(nd4j::memory::MemoryType group, Nd4jLong numBytes); + + /** + * This method returns amount of memory allocated on specified device + * @param deviceId + * @return + */ + Nd4jLong allocatedDevice(int deviceId); + + /** + * This method returns amount of memory allocated in specified group of devices + * @param group + * @return + */ + Nd4jLong allocatedGroup(nd4j::memory::MemoryType group); + + /** + * This method allows to set per-device memory limits + * @param deviceId + * @param numBytes + */ + void setDeviceLimit(int deviceId, Nd4jLong numBytes); + + /** + * This method returns current device limit in bytes + * @param deviceId + * @return + */ + Nd4jLong deviceLimit(int deviceId); + + /** + * This method allows to set per-group memory limits + * @param group + * @param numBytes + */ + void setGroupLimit(nd4j::memory::MemoryType group, Nd4jLong numBytes); + + /** + * This method returns current group limit in bytes + * @param group + * @return + */ + Nd4jLong groupLimit(nd4j::memory::MemoryType group); + }; + } +} + + +#endif //SD_MEMORYCOUNTER_H diff --git a/libnd4j/include/memory/MemoryTracker.h b/libnd4j/include/memory/MemoryTracker.h index 78ade5bcc..097d2903d 100644 --- a/libnd4j/include/memory/MemoryTracker.h +++ b/libnd4j/include/memory/MemoryTracker.h @@ -30,6 +30,9 @@ namespace nd4j { namespace memory { + /** + * This class is used for tracking memory allocation wrt their allocation points in code + */ class ND4J_EXPORT MemoryTracker { private: static MemoryTracker* _INSTANCE; diff --git a/libnd4j/include/memory/impl/MemoryCounter.cpp b/libnd4j/include/memory/impl/MemoryCounter.cpp new file mode 100644 index 000000000..0dc845e37 --- /dev/null +++ b/libnd4j/include/memory/impl/MemoryCounter.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit 
K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "../MemoryCounter.h" +#include +#include +#include + +namespace nd4j { + namespace memory { + + MemoryCounter::MemoryCounter() { + auto numDevices = nd4j::AffinityManager::numberOfDevices(); + + // setting default 0s + for (int e = 0; e < numDevices; e++) { + _deviceLimits[e] = 0; + _deviceCounters[e] = 0; + } + + // setting initial values for limits + _groupLimits[nd4j::memory::MemoryType::HOST] = nd4j::Environment::getInstance()->maxPrimaryMemory(); + _groupLimits[nd4j::memory::MemoryType::DEVICE] = nd4j::Environment::getInstance()->maxSpecialMemory(); + + // setting initial counter values + _groupCounters[nd4j::memory::MemoryType::HOST] = 0; + _groupCounters[nd4j::memory::MemoryType::DEVICE] = 0; + } + + MemoryCounter* MemoryCounter::getInstance() { + if (_INSTANCE == 0) + _INSTANCE = new MemoryCounter(); + + return _INSTANCE; + } + + void MemoryCounter::countIn(int deviceId, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _deviceCounters[deviceId] += numBytes; + } + + void MemoryCounter::countIn(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _groupCounters[group] += numBytes; + } + + void MemoryCounter::countOut(int deviceId, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _deviceCounters[deviceId] -= 
numBytes; + } + + void MemoryCounter::countOut(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _groupCounters[group] -= numBytes; + } + + bool MemoryCounter::validate(Nd4jLong numBytes) { + auto deviceId = nd4j::AffinityManager::currentDeviceId(); + return validateDevice(deviceId, numBytes); + } + + bool MemoryCounter::validateDevice(int deviceId, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + auto dLimit = _deviceLimits[deviceId]; + if (dLimit <= 0) + return true; + + auto dAlloc = _deviceCounters[deviceId]; + + return numBytes + dAlloc <= dLimit; + } + + bool MemoryCounter::validateGroup(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + auto gLimit = _groupLimits[group]; + if (gLimit <= 0) + return true; + + auto gAlloc = _groupCounters[group]; + + return numBytes + gAlloc <= gLimit; + } + + Nd4jLong MemoryCounter::allocatedDevice(int deviceId) { + std::lock_guard lock(_locker); + return _deviceCounters[deviceId]; + } + + Nd4jLong MemoryCounter::allocatedGroup(nd4j::memory::MemoryType group) { + std::lock_guard lock(_locker); + return _groupCounters[group]; + } + + void MemoryCounter::setDeviceLimit(int deviceId, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _deviceLimits[deviceId] = numBytes; + } + + void MemoryCounter::setGroupLimit(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + std::lock_guard lock(_locker); + _groupLimits[group] = numBytes; + } + + Nd4jLong MemoryCounter::deviceLimit(int deviceId) { + std::lock_guard lock(_locker); + return _deviceLimits[deviceId]; + } + + Nd4jLong MemoryCounter::groupLimit(nd4j::memory::MemoryType group) { + std::lock_guard lock(_locker); + return _groupLimits[group]; + } + + MemoryCounter* MemoryCounter::_INSTANCE = 0; + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/OpRegistrator.h b/libnd4j/include/ops/declarable/OpRegistrator.h index ccec8a3c6..789b361f3 100644 --- 
a/libnd4j/include/ops/declarable/OpRegistrator.h +++ b/libnd4j/include/ops/declarable/OpRegistrator.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index 5e21e3b8e..e20145735 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp b/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp new file mode 100644 index 000000000..e6d270141 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nd4j; +using namespace nd4j::graph; +using namespace nd4j::memory; + +class DataBufferTests : public testing::Test { +public: + +}; + +TEST_F(DataBufferTests, test_alloc_limit_1) { + if (!Environment::getInstance()->isCPU()) + return; + + auto deviceId = AffinityManager::currentDeviceId(); + auto odLimit = MemoryCounter::getInstance()->deviceLimit(deviceId); + auto ogLimit = MemoryCounter::getInstance()->groupLimit(MemoryType::HOST); + auto odUse = MemoryCounter::getInstance()->allocatedDevice(deviceId); + auto ogUse = MemoryCounter::getInstance()->allocatedGroup(MemoryType::HOST); + + auto limitSize = 150 * 1024 * 1024; + auto allocSize = 100000000; + + MemoryCounter::getInstance()->setDeviceLimit(deviceId, odLimit + limitSize); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::HOST, ogLimit + limitSize); + + DataBuffer buffer(allocSize, DataType::INT32); + + // separately testing per-device counters and HOST group counters + ASSERT_EQ(odUse + allocSize, MemoryCounter::getInstance()->allocatedDevice(deviceId)); + ASSERT_EQ(ogUse + allocSize, MemoryCounter::getInstance()->allocatedGroup(MemoryType::HOST)); + + + // setting smaller limits, to make sure next allocation fails with OOM exception + MemoryCounter::getInstance()->setDeviceLimit(deviceId, allocSize - 100); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::HOST, allocSize - 100); + + try { + DataBuffer bufferFailed(allocSize, DataType::INT32); + ASSERT_TRUE(false); + } catch (allocation_exception &e) { + // we expect exception here + } + + // restore original device limit and original HOST group limit (each from its own saved value), so subsequent tests do not fail + MemoryCounter::getInstance()->setDeviceLimit(deviceId, odLimit); + 
MemoryCounter::getInstance()->setGroupLimit(MemoryType::HOST, ogLimit); +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu b/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu new file mode 100644 index 000000000..355633a4e --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu @@ -0,0 +1,87 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nd4j; +using namespace nd4j::graph; +using namespace nd4j::memory; + +class DataBufferTestsCuda : public testing::Test { +public: + +}; + +TEST_F(DataBufferTestsCuda, test_alloc_limit_1) { + auto deviceId = AffinityManager::currentDeviceId(); + + auto odLimit = MemoryCounter::getInstance()->deviceLimit(deviceId); + + auto opLimit = MemoryCounter::getInstance()->groupLimit(MemoryType::HOST); + auto osLimit = MemoryCounter::getInstance()->groupLimit(MemoryType::DEVICE); + + auto odUse = MemoryCounter::getInstance()->allocatedDevice(deviceId); + + auto opUse = MemoryCounter::getInstance()->allocatedGroup(MemoryType::HOST); + auto osUse = MemoryCounter::getInstance()->allocatedGroup(MemoryType::DEVICE); + + auto limitSize = 150000000; + auto allocSize = 100000000; + + MemoryCounter::getInstance()->setDeviceLimit(deviceId, odLimit + limitSize); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::HOST, opLimit + limitSize); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::DEVICE, osLimit + limitSize); + + DataBuffer buffer(allocSize, DataType::INT32, nullptr, true); + + // separately testing per-device limits and group limits + ASSERT_EQ(odUse + allocSize, MemoryCounter::getInstance()->allocatedDevice(deviceId)); + ASSERT_EQ(opUse + allocSize, MemoryCounter::getInstance()->allocatedGroup(MemoryType::HOST)); + ASSERT_EQ(osUse + allocSize, MemoryCounter::getInstance()->allocatedGroup(MemoryType::DEVICE)); + + // setting smaller limits, to make sure next allocation fails with OOM exception + MemoryCounter::getInstance()->setDeviceLimit(deviceId, allocSize - 100); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::DEVICE, allocSize - 100); + + + // this 
allocation should fail, since we're allocating too much + try { + DataBuffer bufferFailed(allocSize, DataType::INT32); + ASSERT_TRUE(false); + } catch (allocation_exception &e) { + // we expect exception here + } + + // + + // restore original limits, so subsequent tests do not fail + MemoryCounter::getInstance()->setDeviceLimit(deviceId, odLimit); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::HOST, opLimit); + MemoryCounter::getInstance()->setGroupLimit(MemoryType::DEVICE, osLimit); +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp index b646a493d..e87dfa125 100644 --- a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp @@ -29,8 +29,6 @@ #include #include -using namespace nd4j; - using namespace nd4j; using namespace nd4j::graph; diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/pairwise/arithmetic/RSubOp.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/pairwise/arithmetic/RSubOp.java index f6cf14bd4..b0090d047 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/pairwise/arithmetic/RSubOp.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/pairwise/arithmetic/RSubOp.java @@ -63,7 +63,7 @@ public class RSubOp extends BaseDynamicTransformOp { @Override public String tensorflowName() { - return "Sub"; + throw new NoOpNameFoundException("No TensorFlow op name found for: " + getClass().getName()); } public RSubOp( INDArray[] inputs, INDArray[] outputs) { diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/OpaqueDataBuffer.java 
b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/OpaqueDataBuffer.java index 3fb5d2748..d5a84eac3 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/OpaqueDataBuffer.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/OpaqueDataBuffer.java @@ -18,18 +18,23 @@ package org.nd4j.nativeblas; import lombok.NonNull; +import lombok.extern.slf4j.Slf4j; import lombok.val; import org.bytedeco.javacpp.Pointer; import org.nd4j.linalg.api.buffer.DataType; +import java.util.concurrent.locks.LockSupport; + /** * This class is a opaque pointer to InteropDataBuffer, used for Java/C++ interop related to INDArray DataBuffer * * @author saudet + * @author raver119@gmail.com */ +@Slf4j public class OpaqueDataBuffer extends Pointer { // TODO: make this configurable - private static final int MAX_TRIES = 3; + private static final int MAX_TRIES = 5; public OpaqueDataBuffer(Pointer p) { super(p); } @@ -53,11 +58,13 @@ public class OpaqueDataBuffer extends Pointer { // check error code ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode(); if (ec != 0) { - if (em == null) - em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); + em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); // if allocation failed it might be caused by casual OOM, so we'll try GC System.gc(); + + // sleeping for 50ms + Thread.sleep(50); } else { // just return the buffer return buffer; @@ -89,11 +96,12 @@ public class OpaqueDataBuffer extends Pointer { // check error code ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode(); if (ec != 0) { - if (em == null) - em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); + em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); // if expansion failed it might be caused by casual OOM, so we'll try GC System.gc(); + + 
Thread.sleep(50); } else { // just return return; @@ -126,11 +134,13 @@ public class OpaqueDataBuffer extends Pointer { // check error code ec = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode(); if (ec != 0) { - if (em == null) - em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); + em = NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorMessage(); // if view creation failed it might be caused by casual OOM, so we'll try GC System.gc(); + + // sleeping to let gc kick in + Thread.sleep(50); } else { // just return return buffer; diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/CudaEnvironment.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/CudaEnvironment.java index 83e9c156f..16abeef9a 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/CudaEnvironment.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/CudaEnvironment.java @@ -167,4 +167,29 @@ public class CudaEnvironment implements Environment { public boolean isCPU() { return e.isCPU(); } + + @Override + public void setGroupLimit(int group, long numBytes) { + e.setGroupLimit(group, numBytes); + } + + @Override + public void setDeviceLimit(int deviceId, long numBytes) { + e.setDeviceLimit(deviceId, numBytes); + } + + @Override + public long getGroupLimit(int group) { + return e.getGroupLimit(group); + } + + @Override + public long getDeviceLimit(int deviceId) { + return e.getDeviceLimit(deviceId); + } + + @Override + public long getDeviceCouner(int deviceId) { + return e.getDeviceCounter(deviceId); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index b98630dc4..5aa685c7a 100644 --- 
a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -495,7 +495,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define DEV_TESTS_CONSTANTDESCRIPTOR_H // #include -// #include +// #include // #include // #include // #include @@ -808,6 +808,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #include // #include // #include +// #include @Namespace("nd4j") @NoOffset public static class Environment extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -846,10 +847,30 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public native int maxMasterThreads(); public native void setMaxMasterThreads(int max); + /* + * Legacy memory limits API, still used in new API as simplified version + */ public native void setMaxPrimaryMemory(@Cast("uint64_t") long maxBytes); public native void setMaxSpecialyMemory(@Cast("uint64_t") long maxBytes); public native void setMaxDeviceMemory(@Cast("uint64_t") long maxBytes); + public native @Cast("uint64_t") long maxPrimaryMemory(); + public native @Cast("uint64_t") long maxSpecialMemory(); + //////////////////////// + + /* + * Methods for memory limits/counters + */ + public native void setGroupLimit(int group, @Cast("Nd4jLong") long numBytes); + public native void setDeviceLimit(int deviceId, @Cast("Nd4jLong") long numBytes); + + public native @Cast("Nd4jLong") long getGroupLimit(int group); + public native @Cast("Nd4jLong") long getDeviceLimit(int deviceId); + + public native @Cast("Nd4jLong") long getGroupCounter(int group); + public native @Cast("Nd4jLong") long getDeviceCounter(int deviceId); + //////////////////////// + public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); @@ -1017,6 +1038,7 
@@ bool verbose = false; // #include // #include // #include +// #include /** * This function returns last error code stored, @@ -3591,6 +3613,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @@ -4856,7 +4879,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include +// #include // #include // #include // #include @@ -5007,6 +5030,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #ifdef __CUDACC__ // #endif @@ -5458,7 +5482,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); //#include // #include -// #include +// #include // #include // #include // #include @@ -5549,7 +5573,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include @@ -5665,7 +5689,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include @@ -9674,7 +9698,7 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include -// #include +// #include // #include // #include // #include @@ -9939,7 +9963,7 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef DEV_TESTS_SHAPEDESCRIPTOR_H // #define DEV_TESTS_SHAPEDESCRIPTOR_H -// #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuEnvironment.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuEnvironment.java index a97b52d9f..363e8857b 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuEnvironment.java +++ 
b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuEnvironment.java @@ -167,4 +167,29 @@ public class CpuEnvironment implements Environment { public boolean isCPU() { return e.isCPU(); } + + @Override + public void setGroupLimit(int group, long numBytes) { + e.setGroupLimit(group, numBytes); + } + + @Override + public void setDeviceLimit(int deviceId, long numBytes) { + e.setDeviceLimit(deviceId, numBytes); + } + + @Override + public long getGroupLimit(int group) { + return e.getGroupLimit(group); + } + + @Override + public long getDeviceLimit(int deviceId) { + return e.getDeviceLimit(deviceId); + } + + @Override + public long getDeviceCouner(int deviceId) { + return e.getDeviceCounter(deviceId); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 784386cc8..cfabc651c 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -573,7 +573,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define DEV_TESTS_CONSTANTDESCRIPTOR_H // #include -// #include +// #include // #include // #include // #include @@ -811,6 +811,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #include // #include // #include +// #include @Namespace("nd4j") @NoOffset public static class Environment extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ @@ -849,10 +850,30 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public native int maxMasterThreads(); public native void setMaxMasterThreads(int max); + /* + * Legacy memory limits API, still used in new API as simplified version + */ public native void setMaxPrimaryMemory(@Cast("uint64_t") long maxBytes); public native void setMaxSpecialyMemory(@Cast("uint64_t") long maxBytes); public native void setMaxDeviceMemory(@Cast("uint64_t") long maxBytes); + public native @Cast("uint64_t") long maxPrimaryMemory(); + public native @Cast("uint64_t") long maxSpecialMemory(); + //////////////////////// + + /* + * Methods for memory limits/counters + */ + public native void setGroupLimit(int group, @Cast("Nd4jLong") long numBytes); + public native void setDeviceLimit(int deviceId, @Cast("Nd4jLong") long numBytes); + + public native @Cast("Nd4jLong") long getGroupLimit(int group); + public native @Cast("Nd4jLong") long getDeviceLimit(int deviceId); + + public native @Cast("Nd4jLong") long getGroupCounter(int group); + public native @Cast("Nd4jLong") long getDeviceCounter(int deviceId); + //////////////////////// + public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); @@ -1020,6 +1041,7 @@ bool verbose = false; // #include // #include // #include +// #include /** * This function returns last error code stored, @@ -3594,6 +3616,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @@ -4859,7 +4882,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include +// #include // #include // #include // #include @@ -5010,6 +5033,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #ifdef __CUDACC__ // #endif @@ -5461,7 +5485,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); //#include // #include -// 
#include +// #include // #include // #include // #include @@ -5552,7 +5576,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include @@ -5668,7 +5692,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include @@ -11885,7 +11909,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include -// #include +// #include // #include // #include // #include @@ -17106,6 +17130,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Input : batched tensor with rank >=2 * Output: tensor with rank lesser by 1 from input */ +// #if NOT_EXCLUDED(OP_matrix_diag_part) @Namespace("nd4j::ops") public static class matrix_diag_part extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -17121,7 +17146,36 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } +// #endif + /** + * QR decomposition: A = QR, where Q is ortogonal (Q * QT = I) and R is upper triangular. + * For A (MxN) Q is M x M and R is (NxN). + * + * Input : + * 0 - float (or complex float) tensor with shape {.,..,...,M,N} - batch of float matricies + * + * Output: + * 0 - float tensor with shape {.,..,...,MxN} - batch of ortogonal matricies {Qs} + * 1 - float tensor with shape {.,..,...,NxN} - batch of upper triangular matricies {Rs} + */ +// #if NOT_EXCLUDED(OP_qr) + @Namespace("nd4j::ops") public static class qr extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public qr(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public qr(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public qr position(long position) { + return (qr)super.position(position); + } + + public qr() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif /** * This operation takes 2 arrays: original values, and values to be excluded. And returns 2 arrays: values left after exclusion, and indices in original array for surivals. @@ -23687,7 +23741,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef DEV_TESTS_SHAPEDESCRIPTOR_H // #define DEV_TESTS_SHAPEDESCRIPTOR_H -// #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java index c86246a52..ce1ad388c 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java @@ -459,7 +459,7 @@ public class Nd4jTestsC extends BaseNd4jTest { @Test public void testSubiRowVector() { INDArray oneThroughFour = Nd4j.linspace(1, 4, 4, DataType.DOUBLE).reshape('c', 2, 2); - INDArray row1 = oneThroughFour.getRow(1); + INDArray row1 = oneThroughFour.getRow(1).dup(); oneThroughFour.subiRowVector(row1); INDArray result = Nd4j.create(new double[] {-2, -2, 0, 0}, new long[] {2, 2}); assertEquals(getFailureMessage(), result, oneThroughFour); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/api/TestNDArrayCreation.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/api/TestNDArrayCreation.java index cea64a90b..85126b306 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/api/TestNDArrayCreation.java +++ 
b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/api/TestNDArrayCreation.java @@ -17,6 +17,7 @@ package org.nd4j.linalg.api; import lombok.extern.slf4j.Slf4j; +import lombok.val; import org.bytedeco.javacpp.FloatPointer; import org.bytedeco.javacpp.Pointer; import org.junit.Ignore; @@ -32,7 +33,7 @@ import org.nd4j.nativeblas.NativeOpsHolder; import java.util.Map; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; /** * Created by Alex on 30/04/2016. @@ -109,6 +110,44 @@ public class TestNDArrayCreation extends BaseNd4jTest { assertEquals(arrCreate.data().address(), pointer.address()); } + @Test + @Ignore // this is endless test + public void testEndlessAllocation() { + Nd4j.getEnvironment().setMaxSpecialMemory(1); + while (true) { + val arr = Nd4j.createUninitialized(DataType.FLOAT, 100000000); + arr.assign(1.0f); + } + } + + @Test + @Ignore("This test is designed to run in isolation. With parallel gc it makes no real sense since allocated amount changes at any time") + public void testAllocationLimits() throws Exception { + Nd4j.create(1); + + val origDeviceLimit = Nd4j.getEnvironment().getDeviceLimit(0); + val origDeviceCount = Nd4j.getEnvironment().getDeviceCouner(0); + + val limit = origDeviceCount + 10000; + + Nd4j.getEnvironment().setDeviceLimit(0, limit); + + val array = Nd4j.createUninitialized(DataType.DOUBLE, 1024); + assertNotNull(array); + + try { + Nd4j.createUninitialized(DataType.DOUBLE, 1024); + assertTrue(false); + } catch (Exception e) { + // + } + + // we want to be sure there's nothing left after exception + assertEquals(0, NativeOpsHolder.getInstance().getDeviceNativeOps().lastErrorCode()); + + Nd4j.getEnvironment().setDeviceLimit(0, origDeviceLimit); + + } @Override public char ordering() { diff --git a/nd4j/nd4j-context/src/main/java/org/nd4j/linalg/factory/Environment.java b/nd4j/nd4j-context/src/main/java/org/nd4j/linalg/factory/Environment.java index ce8e7e9f6..1b788220a 100644 --- 
a/nd4j/nd4j-context/src/main/java/org/nd4j/linalg/factory/Environment.java +++ b/nd4j/nd4j-context/src/main/java/org/nd4j/linalg/factory/Environment.java @@ -82,4 +82,39 @@ public interface Environment { /** Return true if the backend is a CPU backend, or false otherwise */ boolean isCPU(); + + /** + * This method allows to set memory limit for a specific group of devices. I.e. CUDA or CPU + * @param group + * @param numBytes + */ + void setGroupLimit(int group, long numBytes); + + /** + * This method allows to set memory limit for a specific device. I.e. GPU_0 + * @param deviceId + * @param numBytes + */ + void setDeviceLimit(int deviceId, long numBytes); + + /** + * This method returns current group limit + * @param group + * @return + */ + long getGroupLimit(int group); + + /** + * This method returns current device limit + * @param deviceId + * @return + */ + long getDeviceLimit(int deviceId); + + /** + * This method returns current allocated amount for a specific device. I.e. GPU_0 + * @param deviceId + * @return + */ + long getDeviceCouner(int deviceId); }