cavis/libnd4j/include/memory/impl/MemoryCounter.cpp

133 lines
4.7 KiB
C++
Raw Normal View History

[WIP] Memory limits (#167) * initial commit Signed-off-by: raver119 <raver119@gmail.com> * one more initial commit Signed-off-by: raver119 <raver119@gmail.com> * additional initial commit Signed-off-by: raver119 <raver119@gmail.com> * subsequent initial commit Signed-off-by: raver119 <raver119@gmail.com> * initial commit testing Signed-off-by: raver119 <raver119@gmail.com> * initial commit per device Signed-off-by: raver119 <raver119@gmail.com> * initial commit per group Signed-off-by: raver119 <raver119@gmail.com> * initial commit for cuda Signed-off-by: raver119 <raver119@gmail.com> * initial commit for cuda + few missed lines Signed-off-by: raver119 <raver119@gmail.com> * initial commit for cuda + missed includes Signed-off-by: raver119 <raver119@gmail.com> * initial commit for cuda + one more missed include Signed-off-by: raver119 <raver119@gmail.com> * initial commit shouldn't count host mem as dev0 in cuda Signed-off-by: raver119 <raver119@gmail.com> * initial commit that tracks HOST group limits for CUDA Signed-off-by: raver119 <raver119@gmail.com> * initial commit with some Environment changes Signed-off-by: raver119 <raver119@gmail.com> * initial commit with more Environment changes Signed-off-by: raver119 <raver119@gmail.com> * initial commit with maxMasterThreads fix Signed-off-by: raver119 <raver119@gmail.com> * initial commit with maxMasterThreads fix Signed-off-by: raver119 <raver119@gmail.com> * initial commit without maxMasterThreads exception Signed-off-by: raver119 <raver119@gmail.com> * initial commit without Nd4jULong in Environment Signed-off-by: raver119 <raver119@gmail.com> * add sleep and more iterations for OOM cases Signed-off-by: raver119 <raver119@gmail.com> * limits propagation from java side Signed-off-by: raver119 <raver119@gmail.com> * - consume ErrorCode every time - one test for memory limits Signed-off-by: raver119 <raver119@gmail.com> * unordered_map Signed-off-by: raver119 <raver119@gmail.com> * unordered_map Signed-off-by: 
raver119 <raver119@gmail.com> * unordered_map Signed-off-by: raver119 <raver119@gmail.com> * RSub op mapping fixed Signed-off-by: raver119 <raver119@gmail.com> * typo fixed Signed-off-by: raver119 <raver119@gmail.com> * one bad test fixed Signed-off-by: raver119 <raver119@gmail.com>
2020-01-24 08:11:09 +01:00
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include "../MemoryCounter.h"
#include <execution/AffinityManager.h>
#include <Environment.h>
#include <helpers/logger.h>
namespace nd4j {
namespace memory {
/**
 * Initialises all bookkeeping tables.
 *
 * Every device index in [0, AffinityManager::numberOfDevices()) gets a limit of 0
 * and a counter of 0. The HOST and DEVICE group limits are read from Environment
 * (maxPrimaryMemory() and maxSpecialMemory() respectively) and both group counters
 * start at 0.
 */
MemoryCounter::MemoryCounter() {
    const auto deviceCount = nd4j::AffinityManager::numberOfDevices();

    // per-device state starts zeroed
    for (int deviceId = 0; deviceId < deviceCount; ++deviceId) {
        _deviceLimits[deviceId] = 0;
        _deviceCounters[deviceId] = 0;
    }

    const auto hostGroup = nd4j::memory::MemoryType::HOST;
    const auto deviceGroup = nd4j::memory::MemoryType::DEVICE;

    // HOST group: limit comes from Environment, nothing counted yet
    _groupLimits[hostGroup] = nd4j::Environment::getInstance()->maxPrimaryMemory();
    // DEVICE group: limit comes from Environment, nothing counted yet
    _groupLimits[deviceGroup] = nd4j::Environment::getInstance()->maxSpecialMemory();

    _groupCounters[hostGroup] = 0;
    _groupCounters[deviceGroup] = 0;
}
/**
 * Returns the shared MemoryCounter, creating it on the first call.
 *
 * Creation happens inside the initializer of a function-local static, which
 * C++11 and later guarantee to run exactly once even if several threads make
 * the first call at the same time. The earlier unsynchronised "test for null,
 * then new" sequence allowed two threads to each construct an instance.
 * As before, this function never deletes the object.
 *
 * @return pointer to the shared MemoryCounter
 */
MemoryCounter* MemoryCounter::getInstance() {
    // _INSTANCE is assigned inside the guarded initializer, so it is written only once
    static MemoryCounter* const instance = (_INSTANCE = new MemoryCounter());
    return instance;
}
void MemoryCounter::countIn(int deviceId, Nd4jLong numBytes) {
std::lock_guard<std::mutex> lock(_locker);
_deviceCounters[deviceId] += numBytes;
}
/**
 * Adds numBytes to the counter kept for the given memory group.
 * The update is performed while holding _locker.
 *
 * @param group memory group whose counter is increased
 * @param numBytes amount to add
 */
void MemoryCounter::countIn(nd4j::memory::MemoryType group, Nd4jLong numBytes) {
    std::lock_guard<std::mutex> guard(_locker);

    auto& counter = _groupCounters[group];
    counter += numBytes;
}
void MemoryCounter::countOut(int deviceId, Nd4jLong numBytes) {
std::lock_guard<std::mutex> lock(_locker);
_deviceCounters[deviceId] -= numBytes;
}
/**
 * Subtracts numBytes from the counter kept for the given memory group.
 * The update is performed while holding _locker; the result is not
 * checked against zero.
 *
 * @param group memory group whose counter is decreased
 * @param numBytes amount to subtract
 */
void MemoryCounter::countOut(nd4j::memory::MemoryType group, Nd4jLong numBytes) {
    std::lock_guard<std::mutex> guard(_locker);

    auto& counter = _groupCounters[group];
    counter -= numBytes;
}
/**
 * Runs validateDevice() for the device reported by
 * AffinityManager::currentDeviceId().
 *
 * @param numBytes size of the request being checked
 * @return whatever validateDevice() returns for that device
 */
bool MemoryCounter::validate(Nd4jLong numBytes) {
    const auto currentDevice = nd4j::AffinityManager::currentDeviceId();
    const bool fits = validateDevice(currentDevice, numBytes);
    return fits;
}
/**
 * Checks whether numBytes more can be counted on a device without exceeding
 * that device's limit. Limit and counter are read while holding _locker.
 *
 * @param deviceId device to check
 * @param numBytes size of the request being checked
 * @return true if the device limit is <= 0 (no limit is enforced), or if
 *         the current counter plus numBytes does not exceed the limit;
 *         false otherwise
 */
bool MemoryCounter::validateDevice(int deviceId, Nd4jLong numBytes) {
    std::lock_guard<std::mutex> lock(_locker);

    const auto dLimit = _deviceLimits[deviceId];
    if (dLimit <= 0)
        return true;

    const auto dAlloc = _deviceCounters[deviceId];

    // Compare against the remaining headroom rather than forming numBytes + dAlloc:
    // that signed sum can overflow (undefined behaviour) for very large requests.
    // NOTE(review): assumes the counter is not hugely negative - confirm countOut usage.
    return numBytes <= dLimit - dAlloc;
}
/**
 * Checks whether numBytes more can be counted in a memory group without
 * exceeding that group's limit. Limit and counter are read while holding _locker.
 *
 * @param group memory group to check
 * @param numBytes size of the request being checked
 * @return true if the group limit is <= 0 (no limit is enforced), or if
 *         the current counter plus numBytes does not exceed the limit;
 *         false otherwise
 */
bool MemoryCounter::validateGroup(nd4j::memory::MemoryType group, Nd4jLong numBytes) {
    std::lock_guard<std::mutex> lock(_locker);

    const auto gLimit = _groupLimits[group];
    if (gLimit <= 0)
        return true;

    const auto gAlloc = _groupCounters[group];

    // Compare against the remaining headroom rather than forming numBytes + gAlloc:
    // that signed sum can overflow (undefined behaviour) for very large requests.
    // NOTE(review): assumes the counter is not hugely negative - confirm countOut usage.
    return numBytes <= gLimit - gAlloc;
}
/**
 * Reads the current counter value for a device while holding _locker.
 *
 * @param deviceId device to query
 * @return value stored in the per-device counter table for deviceId
 */
Nd4jLong MemoryCounter::allocatedDevice(int deviceId) {
    std::lock_guard<std::mutex> guard(_locker);

    const auto counted = _deviceCounters[deviceId];
    return counted;
}
/**
 * Reads the current counter value for a memory group while holding _locker.
 *
 * @param group memory group to query
 * @return value stored in the per-group counter table for group
 */
Nd4jLong MemoryCounter::allocatedGroup(nd4j::memory::MemoryType group) {
    std::lock_guard<std::mutex> guard(_locker);

    const auto counted = _groupCounters[group];
    return counted;
}
void MemoryCounter::setDeviceLimit(int deviceId, Nd4jLong numBytes) {
std::lock_guard<std::mutex> lock(_locker);
_deviceLimits[deviceId] = numBytes;
}
/**
 * Stores a new limit for a memory group while holding _locker.
 * validateGroup() treats a limit <= 0 as "no limit".
 *
 * @param group memory group whose limit is replaced
 * @param numBytes new limit value
 */
void MemoryCounter::setGroupLimit(nd4j::memory::MemoryType group, Nd4jLong numBytes) {
    std::lock_guard<std::mutex> guard(_locker);

    auto& limit = _groupLimits[group];
    limit = numBytes;
}
/**
 * Reads the limit currently stored for a device while holding _locker.
 *
 * @param deviceId device to query
 * @return value stored in the per-device limit table for deviceId
 */
Nd4jLong MemoryCounter::deviceLimit(int deviceId) {
    std::lock_guard<std::mutex> guard(_locker);

    const auto limit = _deviceLimits[deviceId];
    return limit;
}
/**
 * Reads the limit currently stored for a memory group while holding _locker.
 *
 * @param group memory group to query
 * @return value stored in the per-group limit table for group
 */
Nd4jLong MemoryCounter::groupLimit(nd4j::memory::MemoryType group) {
    std::lock_guard<std::mutex> guard(_locker);

    const auto limit = _groupLimits[group];
    return limit;
}
// Storage for the shared instance pointer; null until getInstance() creates the object.
MemoryCounter* MemoryCounter::_INSTANCE = nullptr;
}
}