Add libnd4j benchmarks (#3)

This PR adds two libnd4j benchmark suites: a light suite (LightBenchmarkSuit) and a full suite (FullBenchmarkSuit).
master
Alex Black 2019-07-12 15:21:15 +10:00 committed by AlexDBlack
parent 62c6a73f9d
commit cb6654bebb
36 changed files with 3473 additions and 145 deletions
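For context, a minimal usage sketch (not part of this diff; assumes the libnd4j headers are on the include path). Both suites are exposed through new NativeOps entry points, and the returned report is heap-allocated, so the caller releases it with the matching deleteCharArray():

#include <NativeOps.h>
#include <pointercast.h>

int main() {
    NativeOps ops;
    // printOut=true also echoes the TSV report to stdout via nd4j_printf
    const char *report = ops.runLightBenchmarkSuit(true);
    // ... consume report (tab-separated benchmark rows) ...
    ops.deleteCharArray(reinterpret_cast<Nd4jPointer>(const_cast<char *>(report)));
    return 0;
}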


@ -247,6 +247,7 @@ if(CUDA_BLAS)
endif()
endif()
file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h)
file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h)
file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/impl/*.cpp ../include/execution/*.cu ../include/execution/*.h)
file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h)
@ -267,7 +268,7 @@ if(CUDA_BLAS)
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES})
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
else()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_TESTS=true")
@ -276,7 +277,7 @@ if(CUDA_BLAS)
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES})
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
endif()
@ -300,6 +301,7 @@ elseif(CPU_BLAS)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
endif()
file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h)
file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h)
file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/*.cpp ../include/execution/*.h)
file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h)
@ -320,7 +322,7 @@ elseif(CPU_BLAS)
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES}
${OPS_SOURCES})
${OPS_SOURCES} ${PERF_SOURCES})
if(IOS)
add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>)
else()


@ -759,6 +759,13 @@ public:
*/
int getDeviceMajor(int deviceId);
/**
* This method returns the amount of memory, in bytes, currently cached for constants on the given device
* @param deviceId device to query
* @return cached amount in bytes; 0 for an out-of-range device id
*/
Nd4jLong getCachedMemory(int deviceId);
/**
*
* @param ptrToDeviceId
@ -1653,6 +1660,7 @@ public:
int unregisterGraph(Nd4jPointer *extraPointers, Nd4jLong graphId);
void deleteCharArray(Nd4jPointer pointer);
void deleteIntArray(Nd4jPointer pointer);
void deleteLongArray(Nd4jPointer pointer);
void deletePointerArray(Nd4jPointer pointer);
@ -1690,6 +1698,10 @@ public:
nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, Nd4jLong *data, int length);
nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, double *data, int length);
nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor);
const char* runLightBenchmarkSuit(bool printOut);
const char* runFullBenchmarkSuit(bool printOut);
};


@ -204,6 +204,9 @@ template void NDArrayFactory::memcpyFromVector(void *ptr, const std::vector<int8
template NDArray* NDArrayFactory::create_(const bool scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const int8_t scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const uint8_t scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const uint16_t scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const uint32_t scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const uint64_t scalar, nd4j::LaunchContext * context);
template NDArray* NDArrayFactory::create_(const int16_t scalar, nd4j::LaunchContext * context);
template <typename T>


@ -72,6 +72,9 @@ bool experimentalSupport = false;
#include <graph/ResultWrapper.h>
#include <helpers/DebugHelper.h>
#include <helpers/ConstantTadHelper.h>
#include <performance/benchmarking/BenchmarkSuit.h>
#include <performance/benchmarking/FullBenchmarkSuit.h>
#include <performance/benchmarking/LightBenchmarkSuit.h>
using namespace nd4j;
@ -2304,6 +2307,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) {
delete[] ptr;
}
void NativeOps::deleteCharArray(Nd4jPointer pointer) {
auto ptr = reinterpret_cast<char *>(pointer);
delete[] ptr;
}
void NativeOps::deleteIntArray(Nd4jPointer pointer) {
auto ptr = reinterpret_cast<int *>(pointer);
delete[] ptr;
@ -2792,6 +2800,38 @@ void NativeOps::sortTadByValue(Nd4jPointer *extraPointers,
BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortTadByValue(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES);
}
const char* NativeOps::runLightBenchmarkSuit(bool printOut) {
nd4j::LightBenchmarkSuit suit;
auto result = suit.runSuit();
if (printOut)
nd4j_printf("%s\n", result.data());
auto chars = new char[result.length()+1];
std::memcpy(chars, result.data(), result.length());
chars[result.length()] = (char) 0x0;
return chars;
}
Nd4jLong NativeOps::getCachedMemory(int deviceId) {
return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId);
}
const char* NativeOps::runFullBenchmarkSuit(bool printOut) {
nd4j::FullBenchmarkSuit suit;
auto result = suit.runSuit();
if (printOut)
nd4j_printf("%s\n", result.data());
auto chars = new char[result.length()+1];
std::memcpy(chars, result.data(), result.length());
chars[result.length()] = (char) 0x0;
return chars;
}
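Both entry points above duplicate the same copy-out logic; a hypothetical helper (not part of this diff) makes the ownership contract explicit, namely that the caller receives a NUL-terminated heap buffer to be released via deleteCharArray():

static const char* copyToHeapCString(const std::string &result) {
    // NUL-terminated heap copy; ownership passes to the caller
    auto chars = new char[result.length() + 1];
    std::memcpy(chars, result.data(), result.length());
    chars[result.length()] = '\0';
    return chars;
}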
BUILD_SINGLE_TEMPLATE(template void flattenGeneric,(Nd4jPointer*, int, char, void*, Nd4jLong*, void*, Nd4jLong*), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void pullRowsGeneric, (void *, Nd4jLong*, void*, Nd4jLong*, const int, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*), LIBND4J_TYPES);


@ -47,6 +47,8 @@
using namespace nd4j;
#include <loops/special_kernels.h>
#include <performance/benchmarking/FullBenchmarkSuit.h>
#include <performance/benchmarking/LightBenchmarkSuit.h>
cudaDeviceProp *deviceProperties;
cudaFuncAttributes *funcAttributes = new cudaFuncAttributes[64];
@ -2804,6 +2806,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) {
delete[] ptr;
}
void NativeOps::deleteCharArray(Nd4jPointer pointer) {
auto ptr = reinterpret_cast<char *>(pointer);
delete[] ptr;
}
void NativeOps::deleteIntArray(Nd4jPointer pointer) {
auto ptr = reinterpret_cast<int *>(pointer);
delete[] ptr;
@ -3289,3 +3296,35 @@ Nd4jPointer NativeOps::shapeBufferForNumpy(Nd4jPointer npyArray) {
}
return reinterpret_cast<Nd4jPointer>(nd4j::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true));
}
const char* NativeOps::runLightBenchmarkSuit(bool printOut) {
nd4j::LightBenchmarkSuit suit;
auto result = suit.runSuit();
if (printOut)
nd4j_printf("%s\n", result.data());
auto chars = new char[result.length()+1];
std::memcpy(chars, result.data(), result.length());
chars[result.length()] = (char) 0x0;
return chars;
}
const char* NativeOps::runFullBenchmarkSuit(bool printOut) {
nd4j::FullBenchmarkSuit suit;
auto result = suit.runSuit();
if (printOut)
nd4j_printf("%s\n", result.data());
auto chars = new char[result.length()+1];
std::memcpy(chars, result.data(), result.length());
chars[result.length()] = (char) 0x0;
return chars;
}
Nd4jLong NativeOps::getCachedMemory(int deviceId) {
return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId);
}


@ -50,7 +50,7 @@ namespace nd4j {
unsigned int _rIterations;
protected:
void benchmarkOperation(OpBenchmark &benchmark);
std::string benchmarkOperation(OpBenchmark &benchmark);
void benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z);
@ -58,34 +58,30 @@ namespace nd4j {
void benchmarkGEMM(char orderA, std::initializer_list<Nd4jLong> shapeA, char orderB, std::initializer_list<Nd4jLong> shapeB, char orderC, std::initializer_list<Nd4jLong> shapeC);
void printHeader();
std::string printHeader();
public:
BenchmarkHelper(unsigned int warmUpIterations = 10, unsigned int runIterations = 100);
void runOperationSuit(std::initializer_list<OpBenchmark*> benchmarks, const char *msg = nullptr);
void runOperationSuit(std::vector<OpBenchmark*> &benchmarks, bool postHeaders, const char *msg = nullptr);
std::string runOperationSuit(std::initializer_list<OpBenchmark*> benchmarks, const char *msg = nullptr);
std::string runOperationSuit(std::vector<OpBenchmark*> &benchmarks, bool postHeaders, const char *msg = nullptr);
std::string runOperationSuit(OpBenchmark* benchmark);
void runOperationSuit(ScalarBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
void runOperationSuit(TransformBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
void runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
void runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet &, ResultSet &, ResultSet &)>& func, const char *message = nullptr);
void runOperationSuit(PairwiseBenchmark *op, const std::function<void (ResultSet &, ResultSet &, ResultSet &)>& func, const char *message = nullptr);
std::string runOperationSuit(ScalarBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
std::string runOperationSuit(TransformBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
std::string runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet &, ResultSet &)>& func, const char *message = nullptr);
std::string runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet &, ResultSet &, ResultSet &)>& func, const char *message = nullptr);
std::string runOperationSuit(PairwiseBenchmark *op, const std::function<void (ResultSet &, ResultSet &, ResultSet &)>& func, const char *message = nullptr);
void runOperationSuit(TransformBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(ScalarBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(BroadcastBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(PairwiseBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(MatrixBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(TransformBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(ScalarBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(BroadcastBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(PairwiseBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
std::string runOperationSuit(MatrixBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runOperationSuit(DeclarableBenchmark *op, const std::function<Context* (Parameters &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
void runScalarSuit();
void runAllSuits();
std::string runOperationSuit(DeclarableBenchmark *op, const std::function<Context* (Parameters &)>& func, ParametersBatch &parametersBatch, const char *message = nullptr);
};
}
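With these signature changes, reports are accumulated and returned instead of being printed inside the helper; a hedged caller-side sketch (someBenchmark is a hypothetical OpBenchmark*):

BenchmarkHelper helper(10, 100);                     // warm-up iterations, run iterations
std::string report;
report += helper.runOperationSuit(someBenchmark);    // returns one TSV row per benchmark
nd4j_printf("%s\n", report.c_str());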


@ -44,6 +44,8 @@ namespace nd4j {
std::vector<Nd4jPointer> _devicePointers;
std::vector<Nd4jLong> _deviceOffsets;
std::mutex _mutex;
std::vector<Nd4jLong> _counters;
public:
~ConstantHelper() = default;
@ -53,6 +55,8 @@ namespace nd4j {
void* replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace = nullptr);
ConstantDataBuffer* constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType);
Nd4jLong getCachedAmount(int deviceId);
};
}


@ -36,7 +36,7 @@ namespace nd4j {
nd4j::graph::Context *_context = nullptr;
public:
DeclarableBenchmark(nd4j::ops::DeclarableOp &op, std::string name = "") : OpBenchmark() {
_op = ops::OpRegistrator::getInstance()->getOperation(op.getOpHash());
_op = &op; //ops::OpRegistrator::getInstance()->getOperation(op.getOpHash());
_testName = name;
}


@ -30,9 +30,11 @@ namespace nd4j {
ConstantHelper::ConstantHelper() {
int numDevices = getNumberOfDevices();
_cache.resize(numDevices);
_counters.resize(numDevices);
for (int e = 0; e < numDevices; e++) {
std::map<ConstantDescriptor, ConstantHolder> map;
_cache[e] = map;
_counters[e] = 0L;
}
}
@ -44,8 +46,14 @@ namespace nd4j {
}
void* ConstantHelper::replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace) {
if (workspace == nullptr) {
auto deviceId = getCurrentDevice();
_counters[deviceId] += numBytes;
}
int8_t *ptr = nullptr;
ALLOCATE(ptr, workspace, numBytes, int8_t);
std::memcpy(ptr, src, numBytes);
return ptr;
}
@ -71,7 +79,9 @@ namespace nd4j {
if (holder->hasBuffer(dataType))
return holder->getConstantDataBuffer(dataType);
else {
int8_t *cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)];
auto size = descriptor.length() * DataTypeUtils::sizeOf(dataType);
auto cbuff = new int8_t[size];
_counters[deviceId] += size;
// create buffer with this dtype
if (descriptor.isFloat()) {
@ -87,6 +97,14 @@ namespace nd4j {
}
}
Nd4jLong ConstantHelper::getCachedAmount(int deviceId) {
int numDevices = getNumberOfDevices();
if (deviceId >= numDevices || deviceId < 0)
return 0L;
else
return _counters[deviceId];
}
nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0;
}
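A short sketch (assumption: host code built with this diff applied) of observing the new accounting through the NativeOps wrapper added in this PR:

NativeOps ops;
// bytes currently cached for constants on device 0; returns 0 for invalid ids
Nd4jLong cached = ops.getCachedMemory(0);
nd4j_printf("device 0 constant cache: %lld bytes\n", cached);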


@ -70,6 +70,7 @@ namespace nd4j {
_devicePointers.resize(numDevices);
_deviceOffsets.resize(numDevices);
_cache.resize(numDevices);
_counters.resize(numDevices);
// filling all pointers
for (int e = 0; e < numDevices; e++) {
@ -83,6 +84,7 @@ namespace nd4j {
_devicePointers[e] = constant;
_deviceOffsets[e] = 0;
_cache[e] = devCache;
_counters[e] = 0L;
}
//
@ -115,6 +117,7 @@ namespace nd4j {
constantPtr = _devicePointers[deviceId];
constantOffset = _deviceOffsets[deviceId];
}
if (constantOffset + numBytes >= CONSTANT_LIMIT) {
int8_t *ptr = nullptr;
ALLOCATE_SPECIAL(ptr, workspace, numBytes, int8_t);
@ -154,7 +157,9 @@ namespace nd4j {
if (holder->hasBuffer(dataType)) {
return holder->getConstantDataBuffer(dataType);
} else {
auto cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)];
auto numBytes = descriptor.length() * DataTypeUtils::sizeOf(dataType);
auto cbuff = new int8_t[numBytes];
_counters[deviceId] += numBytes;
// create buffer with this dtype
if (descriptor.isFloat()) {
@ -172,5 +177,13 @@ namespace nd4j {
}
}
Nd4jLong ConstantHelper::getCachedAmount(int deviceId) {
int numDevices = getNumberOfDevices();
if (deviceId >= numDevices || deviceId < 0)
return 0L;
else
return _counters[deviceId];
}
nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0;
}


@ -30,11 +30,11 @@ namespace nd4j {
_rIterations = runIterations;
}
void BenchmarkHelper::printHeader() {
nd4j_printf("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n","");
std::string BenchmarkHelper::printHeader() {
return std::string("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n");
}
void BenchmarkHelper::benchmarkOperation(OpBenchmark &benchmark) {
std::string BenchmarkHelper::benchmarkOperation(OpBenchmark &benchmark) {
for (uint i = 0; i < _wIterations; i++)
benchmark.executeOnce();
@ -57,9 +57,9 @@ namespace nd4j {
std::sort(timings.begin(), timings.end());
Nd4jLong median = timings[_rIterations / 2];
NDArray n = NDArrayFactory::create(timings, LaunchContext::defaultContext());
auto n = NDArrayFactory::create(timings, LaunchContext::defaultContext());
double stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e<double>(0);
auto stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e<double>(0);
auto min = n.reduceNumber(nd4j::reduce::Min).e<Nd4jLong>(0);
auto max = n.reduceNumber(nd4j::reduce::Max).e<Nd4jLong>(0);
@ -71,10 +71,16 @@ namespace nd4j {
auto a = benchmark.axis();
auto inpl = benchmark.inplace();
std::string temp;
temp.resize(65536);
// printing out stuff
nd4j_printf("%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(),
snprintf(const_cast<char *>(temp.data()), temp.length(), "%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(),
_wIterations, _rIterations, t.c_str(), inpl.c_str(), s.c_str(), strides.c_str(), a.c_str(), o.c_str(),
nd4j::math::nd4j_floor<double, Nd4jLong>(sumT), median, min, max, stdev);
auto pos = temp.find('\n');
return temp.substr(0, pos + 1);
}
void BenchmarkHelper::benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z) {
@ -126,47 +132,44 @@ namespace nd4j {
nd4j::math::nd4j_floor<double, Nd4jLong>(sumT), median, min, max, stdev);
}
void BenchmarkHelper::runOperationSuit(std::initializer_list<OpBenchmark*> benchmarks, const char *msg) {
std::string BenchmarkHelper::runOperationSuit(std::initializer_list<OpBenchmark*> benchmarks, const char *msg) {
std::vector<OpBenchmark*> ops(benchmarks);
runOperationSuit(ops, msg);
return runOperationSuit(ops, msg);
}
void BenchmarkHelper::runOperationSuit(std::vector<OpBenchmark*> &benchmarks, bool postHeaders, const char *msg) {
std::string BenchmarkHelper::runOperationSuit(OpBenchmark* benchmark) {
return benchmarkOperation(*benchmark);
}
std::string BenchmarkHelper::runOperationSuit(std::vector<OpBenchmark*> &benchmarks, bool postHeaders, const char *msg) {
std::string result;
if (msg != nullptr && postHeaders) {
nd4j_printf("\n%s\n", msg);
result += "\n";
result += msg;
result += "\n";
}
if (postHeaders)
printHeader();
result += printHeader();
for (auto v:benchmarks)
benchmarkOperation(*v);
result += benchmarkOperation(*v);
return result;
}
void BenchmarkHelper::runScalarSuit() {
printHeader();
std::initializer_list<std::initializer_list<Nd4jLong>> shapes = {{100}, {32, 256}, {32, 150, 200}, {32, 3, 244, 244}, {32, 64, 128, 256}};
std::initializer_list<nd4j::DataType> dataTypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE};
std::initializer_list<nd4j::scalar::Ops> ops = {scalar::Add, scalar::Divide, scalar::Pow};
for (const auto &d:dataTypes) {
for (const auto &o:ops) {
for (const auto &s:shapes) {
//benchmarkScalarOperation(o, 2.0, s, d);
}
}
}
}
void BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function<Context* (Parameters &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function<Context* (Parameters &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string result;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
result += "\n";
result += message;
result += "\n";
}
printHeader();
result += printHeader();
std::vector<OpBenchmark*> list;
@ -175,25 +178,26 @@ namespace nd4j {
auto clone = reinterpret_cast<DeclarableBenchmark*>(op->clone());
clone->setContext(ctx);
list.emplace_back(clone);
result += runOperationSuit(clone);
delete clone;
}
runOperationSuit(list, false);
// removing everything
for (auto v:list) {
delete reinterpret_cast<DeclarableBenchmark*>(v);
}
return result;
}
void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -217,16 +221,20 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ScalarBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string output;
ResultSet x;
x.setNonRemovable();
ResultSet z;
@ -248,23 +256,27 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, message);
output += runOperationSuit(result, message);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ScalarBenchmark*>(v);
}
return output;
}
void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function<void (Parameters &, ResultSet &, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -288,16 +300,20 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<TransformBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string output;
ResultSet x;
x.setNonRemovable();
ResultSet z;
@ -319,22 +335,27 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, message);
output += runOperationSuit(result, message);
// removing everything
for (auto v:result) {
delete reinterpret_cast<TransformBenchmark*>(v);
}
return output;
}
void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string output;
auto parameters = parametersBatch.parameters();
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -358,16 +379,19 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ReductionBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet&, ResultSet&)>& func, const char *message) {
std::string output;
ResultSet x;
x.setNonRemovable();
ResultSet z;
@ -389,19 +413,24 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, message);
output += runOperationSuit(result, message);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ReductionBenchmark*>(v);
}
return output;
}
void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
@ -436,16 +465,20 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ReductionBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet&, ResultSet&, ResultSet &)>& func, const char *message) {
std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function<void (ResultSet&, ResultSet&, ResultSet &)>& func, const char *message) {
std::string output;
ResultSet x;
x.setNonRemovable();
ResultSet y;
@ -474,22 +507,27 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, message);
output += runOperationSuit(result, message);
// removing everything
for (auto v:result) {
delete reinterpret_cast<ReductionBenchmark*>(v);
}
return output;
}
void BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -518,23 +556,28 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<BroadcastBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -562,16 +605,20 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<PairwiseBenchmark*>(v);
}
}
return output;
}
void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function<void (ResultSet&, ResultSet&, ResultSet &)>& func, const char *message) {
std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function<void (ResultSet&, ResultSet&, ResultSet &)>& func, const char *message) {
std::string output;
ResultSet x;
x.setNonRemovable();
ResultSet y;
@ -597,22 +644,27 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, message);
output += runOperationSuit(result, message);
// removing everything
for (auto v:result) {
delete reinterpret_cast<PairwiseBenchmark*>(v);
}
return output;
}
void BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
std::string BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const std::function<void (Parameters &, ResultSet&, ResultSet&, ResultSet &)>& func, ParametersBatch &parametersBatch, const char *message) {
auto parameters = parametersBatch.parameters();
std::string output;
if (message != nullptr) {
nd4j_printf("\n%s\n", message);
output += "\n";
output += message;
output += "\n";
}
printHeader();
output += printHeader();
for (auto &p: parameters) {
ResultSet x;
@ -637,12 +689,14 @@ namespace nd4j {
result.emplace_back(clone);
}
runOperationSuit(result, false);
output += runOperationSuit(result, false);
// removing everything
for (auto v:result) {
delete reinterpret_cast<MatrixBenchmark*>(v);
}
}
return output;
}
}


@ -56,7 +56,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
std::vector<int> dimsOut(indices->rankOf());
std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1
const Nd4jLong numOfSubArrs = indices->lengthOf();
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold())
for(int i = 0; i < numOfSubArrs; ++i) {
NDArray subArrOut = (*output)(i, dimsOut);
NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
@ -72,7 +72,7 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->eleme
}
else { // vector case
const Nd4jLong numOfSubArrs = intArgs.size() - 1;
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold())
for(int i = 0; i < numOfSubArrs; ++i) {
NDArray subArrOut = (*output)(i, {axis});
NDArray subArrIn = (*input)(intArgs[i+1], {axis});


@ -171,15 +171,8 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast
const int numUnits = cLast->sizeAt(1);
//Concat inputs: [xt, yt-1]: concat([bs,nIn],[bs,nOut]) -> [bs, (nIn+nOut)]
nd4j::ops::concat concat;
Context cContext(119);
auto concatOut = NDArrayFactory::create(xt->ordering(), {xt->sizeAt(0), xt->sizeAt(1) + yLast->sizeAt(1)}, xt->dataType(), xt->getContext());
cContext.setInputArray(0, const_cast<NDArray*>(xt), false);
cContext.setInputArray(1, const_cast<NDArray*>(yLast), false);
cContext.setOutputArray(0, &concatOut, false);
cContext.getIArguments()->emplace_back(1);
concat.execute(&cContext);
helpers::concat(xt->getContext(), {const_cast<NDArray*>(xt), const_cast<NDArray*>(yLast)}, concatOut, {1});
//NDArray* NDArrayFactory::create_( const char order, const std::vector<Nd4jLong> &shape, nd4j::DataType dataType, nd4j::memory::Workspace* workspace) {
std::vector<Nd4jLong> shape = {bS, 4*numUnits};


@ -45,10 +45,26 @@ namespace nd4j {
const NDArray* iSeq, const NDArray* cSeq, const NDArray* fSeq, const NDArray* oSeq, const NDArray* zSeq,
const NDArray* hSeq, const NDArray* ySeq, const std::vector<double>& params, const int dataFormat){
const int seqLen = xSeq->sizeAt(0);
const int mb = xSeq->sizeAt(1);
const int inSize = xSeq->sizeAt(2);
const int outSize = iSeq->sizeAt(2);
int seqLen, mb, inSize, outSize;
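// dataFormat semantics, inferred from the branches below:
//   0: x is [seqLen, mb, inSize] (TNS); 1: [mb, inSize, seqLen] (NST); 2: [mb, seqLen, inSize] (NTS)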
if(dataFormat == 0) {
seqLen = xSeq->sizeAt(0);
mb = xSeq->sizeAt(1);
inSize = xSeq->sizeAt(2);
outSize = iSeq->sizeAt(2);
}
else if(dataFormat == 1) {
seqLen = xSeq->sizeAt(2);
mb = xSeq->sizeAt(0);
inSize = xSeq->sizeAt(1);
outSize = iSeq->sizeAt(1);
}
else if(dataFormat == 2) {
seqLen = xSeq->sizeAt(1);
mb = xSeq->sizeAt(0);
inSize = xSeq->sizeAt(2);
outSize = iSeq->sizeAt(2);
}
const std::vector<Nd4jLong> inSliceShape({mb,inSize});
const std::vector<Nd4jLong> outSliceShape({mb,outSize});


@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef LIBND4J_BENCHMARKSUIT_H
#define LIBND4J_BENCHMARKSUIT_H
#include <string>
#include <pointercast.h>
#include <dll.h>
#include <BenchmarkHelper.h>
#include <NDArrayFactory.h>
namespace nd4j {
class ND4J_EXPORT BenchmarkSuit {
public:
BenchmarkSuit() = default;
virtual ~BenchmarkSuit() = default;
virtual std::string runSuit() = 0;
};
}
#endif //LIBND4J_BENCHMARKSUIT_H


@ -0,0 +1,34 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef LIBND4J_FULLBENCHMARKSUIT_H
#define LIBND4J_FULLBENCHMARKSUIT_H
#include <performance/benchmarking/BenchmarkSuit.h>
namespace nd4j {
class FullBenchmarkSuit : public BenchmarkSuit {
public:
std::string runSuit() override;
};
}
#endif //LIBND4J_FULLBENCHMARKSUIT_H


@ -0,0 +1,34 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef LIBND4J_LIGHTBENCHMARKSUIT_H
#define LIBND4J_LIGHTBENCHMARKSUIT_H
#include <performance/benchmarking/BenchmarkSuit.h>
namespace nd4j {
class LightBenchmarkSuit : public BenchmarkSuit {
public:
std::string runSuit() override;
};
}
#endif //LIBND4J_LIGHTBENCHMARKSUIT_H


@ -0,0 +1,20 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <performance/benchmarking/BenchmarkSuit.h>

File diff suppressed because it is too large


@ -0,0 +1,639 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <ops/declarable/CustomOperations.h>
#include "performance/benchmarking/LightBenchmarkSuit.h"
#ifdef _RELEASE
#define WARMUP 3
#define NUM_ITER 10
#else
#define WARMUP 0
#define NUM_ITER 1
#endif
namespace nd4j {
template <typename T>
static std::string transformBenchmark() {
std::string output;
output += "transformBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 - 4MB
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
}
};
ScalarBenchmark sbRelu(scalar::Ops::RELU, "RELU");
sbRelu.setY(NDArrayFactory::create_<T>(0.0));
TransformBenchmark tbSigmoid(transform::StrictOps::Sigmoid, "sigmoid");
TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");
output += helper.runOperationSuit(&sbRelu, generator, batch, "RELU");
output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Sigmoid");
output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Softmax");
return output;
}
template <typename T>
static std::string scalarBenchmark() {
std::string output;
output += "scalarBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XZ() {
auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
arr->assign(1.0);
x.push_back(arr);
if(p.getIntParam("inplace") == 1){
z.push_back(arr);
} else {
z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
}
};
ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");
sbAdd.setY(NDArrayFactory::create_<T>(3.14159265359));
sbDiv.setY(NDArrayFactory::create_<T>(3.14159265359));
sbPow.setY(NDArrayFactory::create_<T>(3.14159265359));
output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359)");
output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359)");
output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359)");
return output;
}
template <typename T>
static std::string pairwiseBenchmark() {
std::string output;
output += "pairwiseBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
IntPowerParameters length("length", 2, 8, 20, 4); //2^4 to 2^20 in steps of 4 - 2^4, 2^8, 2^16, 2^20
BoolParameters inplace("inplace");
ParametersBatch batch({&length, &inplace});
auto generator = PARAMETRIC_XYZ() {
auto arr1 = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
auto arr2 = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
x.push_back(arr1);
y.push_back(arr2);
if(p.getIntParam("inplace") == 1){
z.push_back(arr1);
} else {
z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
}
};
PairwiseBenchmark pb1(pairwise::Ops::Add, "Add");
output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add");
PairwiseBenchmark pb2(pairwise::Ops::Divide, "Divide");
output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Divide");
return output;
}
static std::string mismatchedOrderAssign() {
std::string output;
BenchmarkHelper helper(WARMUP, NUM_ITER);
IntPowerParameters rows("rows", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20
BoolParameters cf("cf");
ParametersBatch batch({&rows, &cf});
auto generator = PARAMETRIC_XZ() {
int numElements = 4194304; //2^22
int rows = p.getIntParam("rows");
int cols = numElements / rows;
bool c = p.getIntParam("cf");
auto arr = NDArrayFactory::create_<float>(c ? 'c' : 'f', {rows, cols});
auto arr2 = NDArrayFactory::create_<float>(c ? 'f' : 'c', {rows, cols});
x.push_back(arr);
z.push_back(arr2);
};
TransformBenchmark tb(transform::AnyOps::Assign, "assign");
output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign F32");
//Also test: NCHW to NHWC and back
BoolParameters nchw("nchw");
int mb = 8;
int hw = 64;
int c = 3;
ParametersBatch batch2({&nchw});
auto generator2 = PARAMETRIC_XZ() {
bool nchw = p.getIntParam("nchw");
if(nchw) {
auto orig = NDArrayFactory::create_<float>('c', {mb, c, hw, hw});
orig->permutei({0,2,3,1});
x.push_back(orig);
z.push_back(NDArrayFactory::create_<float>('c', {mb, hw, hw, c}));
} else {
auto orig = NDArrayFactory::create_<float>('c', {mb, hw, hw, c});
orig->permutei({0,3,1,2});
x.push_back(orig);
z.push_back(NDArrayFactory::create_<float>('c', {mb, c, hw, hw}));
}
};
TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw");
output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign FP32");
return output;
}
template <typename T>
static std::string gemmBenchmark() {
std::string output;
output += "gemm " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
for (int o = 0; o <= 1; o++) {
char resultOrder = (o == 0 ? 'f' : 'c');
IntPowerParameters sz("sz", 2, 4, 10, 2); //2^4=16, ..., 2^10=1024 -> 4 elements
ParametersBatch b({&sz});
auto generator = PARAMETRIC_XYZ() {
auto a = p.getIntParam("sz");
auto b = p.getIntParam("sz");
auto c = p.getIntParam("sz");
std::vector<Nd4jLong> shapeA;
std::vector<Nd4jLong> shapeB;
shapeA = {a, b};
shapeB = {b, c};
auto A = NDArrayFactory::create_<T>('c', shapeA);
auto B = NDArrayFactory::create_<T>('c', shapeB);
auto C = NDArrayFactory::create_<T>(resultOrder, {a, c});
x.push_back(A);
y.push_back(B);
z.push_back(C);
};
std::string n;
n += "Gemm - cOrder=";
n += resultOrder;
MatrixBenchmark mb(1.0, 0.0, false, false, n);
output += helper.runOperationSuit(&mb, generator, b, n.c_str());
}
return output;
}
template <typename T>
static std::string reduceFullBenchmark() {
std::string output;
output += "reduceFullBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20
ParametersBatch batch({&length});
auto generator = PARAMETRIC_XYZ() {
auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
x.push_back(arr);
y.push_back(nullptr);
z.push_back(NDArrayFactory::create_<T>(0.0f));
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
ReductionBenchmark rbProd(reduce::SameOps::Prod, "prod");
ReductionBenchmark rbMax(reduce::SameOps::Max, "max");
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Sum - Full Array Reduction");
output += helper.runOperationSuit(&rbProd, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Product - Full Array Reduction");
output += helper.runOperationSuit(&rbMax, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Maximum - Full Array Reduction");
//Index reduction
nd4j::ops::argmax opArgmax;
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
auto generator3 = PARAMETRIC_D(){
auto ctx = new Context(1);
ctx->setInputArray(0, NDArrayFactory::create_<T>('c', {p.getIntParam("length")}), true);
ctx->setInputArray(1, NDArrayFactory::create_<Nd4jLong>((Nd4jLong)0), true);
ctx->setOutputArray(0, NDArrayFactory::create_<Nd4jLong>(0), true);
return ctx;
};
output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction");
return output;
}
template <typename T>
static std::string reduceDimBenchmark(){
std::string output;
output += "reduceDimBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
int length[] = {1024*1024};
int pow[] = {10};
for( int i=0; i<1; i++ ){
IntPowerParameters rows("rows", 2, 0, pow[i], 2);
BoolParameters dim("dim");
ParametersBatch batch({&rows, &dim});
auto generator = PARAMETRIC_XYZ() {
int rows = p.getIntParam("rows");
int cols = length[i] / rows;
int dim = p.getIntParam("dim");
auto arr = NDArrayFactory::create_<T>('c', {rows, cols});
x.push_back(arr);
y.push_back(NDArrayFactory::create_<Nd4jLong>(dim));
NDArray* result;
if(dim == 0){
result = NDArrayFactory::create_<T>('c', {cols});
} else {
result = NDArrayFactory::create_<T>('c', {rows});
}
z.push_back(result);
};
ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
ReductionBenchmark rbMax(reduce::SameOps::Max, "max");
std::string s1("Sum Along Dimension - ");
s1 += std::to_string(length[i]);
std::string s3("Maximum Along Dimension - ");
s3 += std::to_string(length[i]);
output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s1.c_str());
output += helper.runOperationSuit(&rbMax, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s3.c_str());
auto generator3 = PARAMETRIC_D(){
auto ctx = new Context(1);
int rows = p.getIntParam("rows");
int cols = length[i] / rows;
int dim = p.getIntParam("dim");
auto arr = NDArrayFactory::create_<T>('c', {rows, cols});
auto dimArg = new Nd4jLong[1];
dimArg[0] = dim;
ctx->setIArguments(dimArg, 1);
delete[] dimArg;
ctx->setInputArray(0, arr, true);
NDArray* result;
if(dim == 0){
result = NDArrayFactory::create_<Nd4jLong>('c', {cols});
} else {
result = NDArrayFactory::create_<Nd4jLong>('c', {rows});
}
ctx->setOutputArray(0, result, true);
return ctx;
};
std::string s5("Argmax Along Dimension - ");
s5 += std::to_string(length[i]);
nd4j::ops::argmax opArgmax;
DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str());
}
return output;
}
template <typename T>
static std::string conv2d(){
std::string output;
output += "conv2d " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
//Convolution2D op
BoolParameters nhwc("nhwc");
PredefinedParameters k("k", {2, 3});
ParametersBatch batch({&nhwc, &k});
nd4j::ops::conv2d conv2d;
DeclarableBenchmark benchmark(conv2d, "conv2d");
int hw = 64;
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int khw = p.getIntParam("k");
if (n == 0) {
auto input = NDArrayFactory::create_<T>('c', {8, 3, hw, hw});
auto output = NDArrayFactory::create_<T>('c', {8, 3, hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
} else {
auto input = NDArrayFactory::create_<T>('c', {8, hw, hw, 3});
auto output = NDArrayFactory::create_<T>('c', {8, hw, hw, 3});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
}
auto b = NDArrayFactory::create_<T>('c', {3});
auto w = NDArrayFactory::create_<T>('c', {khw, khw, 3, 3}); // [kH, kW, iC, oC] always
ctx->setInputArray(1, w, true);
ctx->setInputArray(2, b, true);
auto args = new Nd4jLong[10];
args[0] = args[1] = khw; //Kernel
args[2] = args[3] = 1;//Stride
args[4] = args[5] = 0; //Pad
args[6] = args[7] = 1; //Dilation
args[8] = 1; //SAME
args[9] = n;//0-nchw, 1=nhwc
ctx->setIArguments(args, 10);
delete[] args;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d");
return output;
}
template <typename T>
static std::string pool2d() {
std::string output;
output += "pool2d " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
//Convolution2D op
BoolParameters nhwc("nhwc");
PredefinedParameters k("k", {2, 3});
ParametersBatch batch({&nhwc, &k});
int c = 3;
int hw = 64;
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int n = p.getIntParam("nhwc");
int khw = p.getIntParam("k");
if (n == 0) {
auto input = NDArrayFactory::create_<T>('c', {8, c, hw, hw});
auto output = NDArrayFactory::create_<T>('c', {8, c, hw, hw});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
} else {
auto input = NDArrayFactory::create_<T>('c', {8, hw, hw, c});
auto output = NDArrayFactory::create_<T>('c', {8, hw, hw, c});
ctx->setInputArray(0, input, true);
ctx->setOutputArray(0, output, true);
}
auto args = new Nd4jLong[11];
args[0] = args[1] = khw; //Kernel
args[2] = args[3] = 1;//Stride
args[4] = args[5] = 0; //Pad
args[6] = args[7] = 1; //Dilation
args[8] = 1; //SAME
args[9] = 0; //Divisor mode - 0 = exclude padding in divisor
args[10] = n;//0-nchw, 1=nhwc
ctx->setIArguments(args, 11);
delete[] args;
return ctx;
};
nd4j::ops::avgpool2d avgpool2d;
DeclarableBenchmark benchmark1(avgpool2d, "avgpool");
output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pool 2d");
nd4j::ops::maxpool2d maxpool2d;
DeclarableBenchmark benchmark2(maxpool2d, "maxpool");
output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pool 2d");
return output;
}
template <typename T>
static std::string lstmBenchmark() {
std::string output;
output += "lstm " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
BenchmarkHelper helper(WARMUP, NUM_ITER);
BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]
PredefinedParameters mb("mb", {1, 8});
int n = 128;
ParametersBatch batch({&format, &mb});
nd4j::ops::lstmBlock lstmBlock;
DeclarableBenchmark benchmark(lstmBlock, "lstm");
int seqLength = 8;
auto generator = PARAMETRIC_D() {
auto ctx = new Context(1);
int f = p.getIntParam("format");
int m = p.getIntParam("mb");
Nd4jLong l = 0;
ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>(l), true); //Max TS length (unused)
if (f == 0) {
//TNS format
ctx->setInputArray(1, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //x
ctx->setOutputArray(0, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //i
ctx->setOutputArray(1, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //c
ctx->setOutputArray(2, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //f
ctx->setOutputArray(3, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //o
ctx->setOutputArray(4, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //z
ctx->setOutputArray(5, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //h
ctx->setOutputArray(6, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true); //y
} else {
//NST format
ctx->setInputArray(1, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //x
ctx->setOutputArray(0, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //i
ctx->setOutputArray(1, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //c
ctx->setOutputArray(2, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //f
ctx->setOutputArray(3, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //o
ctx->setOutputArray(4, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //z
ctx->setOutputArray(5, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //h
ctx->setOutputArray(6, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true); //y
}
auto cLast = NDArrayFactory::create_<T>('c', {m, n});
auto yLast = NDArrayFactory::create_<T>('c', {m, n});
auto W = NDArrayFactory::create_<T>('c', {2 * n, 4 * n});
auto Wci = NDArrayFactory::create_<T>('c', {n});
auto Wcf = NDArrayFactory::create_<T>('c', {n});
auto Wco = NDArrayFactory::create_<T>('c', {n});
auto b = NDArrayFactory::create_<T>('c', {4 * n});
ctx->setInputArray(2, cLast, true);
ctx->setInputArray(3, yLast, true);
ctx->setInputArray(4, W, true);
ctx->setInputArray(5, Wci, true);
ctx->setInputArray(6, Wcf, true);
ctx->setInputArray(7, Wco, true);
ctx->setInputArray(8, b, true);
auto iargs = new Nd4jLong[2];
iargs[0] = 0; //No peephole
iargs[1] = f;
ctx->setIArguments(iargs, 2);
delete[] iargs;
auto targs = new double[2];
targs[0] = 1.0; //forget bias
targs[1] = 0.0; //cell clipping value
ctx->setTArguments(targs, 2);
delete[] targs;
return ctx;
};
output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
return output;
}
static std::string broadcast2d() {
std::string output;
BenchmarkHelper helper(WARMUP, NUM_ITER);
int rows = 65536;
IntPowerParameters cols("cols", 2, 2, 12, 4); //2^2 to 2^12 in steps of 2 - 2^1=2, ..., 2^10=1024
BoolParameters axis("axis");
BoolParameters inplace("inplace");
ParametersBatch batch({&cols, &axis, &inplace});
auto generator = PARAMETRIC_D() {
auto a = p.getIntParam("axis");
auto arr = NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")});
auto ctx = new Context(1);
ctx->setInputArray(0, arr, true);
if(a == 0){
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {rows, 1}), true);
} else {
ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
}
if (p.getIntParam("inplace") == 1) {
ctx->setOutputArray(0, arr);
ctx->markInplace(true);
} else {
ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")}), true);
}
return ctx;
};
std::string s("add");
nd4j::ops::add op;
DeclarableBenchmark benchmark(op, "add");
output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
return output;
}
std::string LightBenchmarkSuit::runSuit() {
#ifdef _RELEASE
std::vector<nd4j::DataType> dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF});
#else
std::vector<nd4j::DataType> dtypes({nd4j::DataType::FLOAT32});
#endif
std::string result;
for (auto t:dtypes) {
nd4j_printf("Running LightBenchmarkSuite.transformBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += transformBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.scalarBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += scalarBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.reduceDimBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += reduceDimBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.gemmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += gemmBenchmark, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.conv2d [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += conv2d, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.pool2d [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += pool2d, (), LIBND4J_TYPES);
nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES);
}
nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", "");
result += broadcast2d();
nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", "");
result += mismatchedOrderAssign();
return result;
}
}
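
For reference, the suite above is surfaced to Java through the new runLightBenchmarkSuit binding added elsewhere in this PR. A minimal driver sketch, mirroring the (commented-out) PlaygroundTests invocation; hedged: assumes an initialized nd4j backend, and the BenchmarkDriver class name is illustrative:

import org.nd4j.linalg.factory.Nd4j;

public class BenchmarkDriver {
    public static void main(String[] args) {
        // printOut=true presumably asks the native side to print results as they
        // are produced (inferred from the parameter name); the formatted report
        // is returned as a String either way.
        String report = Nd4j.getExecutioner().runLightBenchmarkSuit(true);
        System.out.println(report);
    }
}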


@ -311,3 +311,27 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_1) {
delete result;
}
TEST_F(DeclarableOpsTests15, test_lstmBlock_2) {
int seqLength = 32;
int m = 64;
int n = 32;
    auto x0 = NDArrayFactory::create<Nd4jLong>(5);                          //Max TS length (unused)
    auto x1 = NDArrayFactory::create<float>('f', {m, n, seqLength});        //x (NST format)
    auto x2 = NDArrayFactory::create<float>('f', {m, n});                   //cLast
    auto x3 = NDArrayFactory::create<float>('f', {m, n});                   //yLast
    auto x4 = NDArrayFactory::create<float>('f', {2 * n, 4 * n});           //W
    auto x5 = NDArrayFactory::create<float>('f', {n});                      //Wci
    auto x6 = NDArrayFactory::create<float>('f', {n});                      //Wcf
    auto x7 = NDArrayFactory::create<float>('f', {n});                      //Wco
    auto x8 = NDArrayFactory::create<float>('f', {4 * n});                  //b
    nd4j::ops::lstmBlock op;
    auto result = op.execute({&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8}, {1.0, 0.0}, {0, 1});    //tArgs: forget bias, cell clip; iArgs: no peephole, NST format
ASSERT_EQ(Status::OK(), result->status());
auto z = result->at(0);
delete result;
}


@ -38,6 +38,8 @@
#include <helpers/ConstantShapeHelper.h>
#include <helpers/ConstantTadHelper.h>
#include <array>
#include <performance/benchmarking/FullBenchmarkSuit.h>
#include <performance/benchmarking/LightBenchmarkSuit.h>
using namespace nd4j;
using namespace nd4j::graph;
@ -164,6 +166,12 @@ TEST_F(PlaygroundTests, BroadcastOps2d) {
}
*/
TEST_F(PlaygroundTests, test_benchmark_suit_1) {
//LightBenchmarkSuit suit;
//auto output = suit.runSuit();
//nd4j_printf("SUIT OUTPUT\n%s\n", output.data());
}
TEST_F(PlaygroundTests, test_small_reductions) {
auto f = NDArrayFactory::create<float>('c', {1024 ,1024});
f.assign(1.0f);


@ -193,6 +193,7 @@ if ("${OPENBLAS}" OR CMAKE_BUILD_TYPE STREQUAL "Release")
endif()
endif()
file(GLOB_RECURSE PERF_SOURCES false ../../include/performance/*.cpp ../../include/performance/*.h)
file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../../include/exceptions/*.cpp ../../include/exceptions/*.h)
file(GLOB_RECURSE EXEC_SOURCES false ../../include/execution/*.cpp ../../include/execution/*.h)
file(GLOB_RECURSE TYPES_SOURCES false ../../include/types/*.cpp ../../include/types/*.h)
@ -234,7 +235,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas
../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h
../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES}
${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES})
target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES})


@ -917,4 +917,14 @@ public class DefaultOpExecutioner implements OpExecutioner {
public DataBuffer createConstantBuffer(double[] values, DataType desiredType) {
throw new UnsupportedOperationException();
}
@Override
public String runLightBenchmarkSuit(boolean printOut) {
throw new UnsupportedOperationException();
}
@Override
public String runFullBenchmarkSuit(boolean printOut) {
throw new UnsupportedOperationException();
}
}


@ -463,4 +463,8 @@ public interface OpExecutioner {
DataBuffer createConstantBuffer(int[] values, DataType desiredType);
DataBuffer createConstantBuffer(float[] values, DataType desiredType);
DataBuffer createConstantBuffer(double[] values, DataType desiredType);
String runLightBenchmarkSuit(boolean printOut);
String runFullBenchmarkSuit(boolean printOut);
}


@ -1386,15 +1386,39 @@ public class Nd4j {
*/
public static DataBuffer createBufferDetached(int[] shape, DataType type) {
long length = ArrayUtil.prodLong(shape);
switch (type){
case DOUBLE:
return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length);
case FLOAT:
return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length);
case HALF:
return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length);
case BFLOAT16:
return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length);
case UINT64:
return DATA_BUFFER_FACTORY_INSTANCE.createULong(length);
case LONG:
return DATA_BUFFER_FACTORY_INSTANCE.createLong(length);
case UINT32:
return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length);
case INT:
return DATA_BUFFER_FACTORY_INSTANCE.createInt(length);
case UINT16:
return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length);
case SHORT:
return DATA_BUFFER_FACTORY_INSTANCE.createShort(length);
case UBYTE:
return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length);
case BYTE:
return DATA_BUFFER_FACTORY_INSTANCE.createByte(length);
case BOOL:
return DATA_BUFFER_FACTORY_INSTANCE.createBool(length);
case UTF8:
case COMPRESSED:
case UNKNOWN:
default:
throw new UnsupportedOperationException("Cannot create type: " + type);
}
}
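As a quick illustration of the widened coverage, a hedged sketch mirroring the testCreateDetached tests added later in this PR:

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.factory.Nd4j;

// Types such as BFLOAT16 previously fell through the old if-chain;
// they now resolve via the switch above.
DataBuffer db = Nd4j.createBufferDetached(new int[]{10}, DataType.BFLOAT16);
assert db.dataType() == DataType.BFLOAT16;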
/**
@ -1403,16 +1427,39 @@ public class Nd4j {
public static DataBuffer createBuffer(long[] shape, DataType type) {
long length = ArrayUtil.prodLong(shape);
switch (type) {
case BOOL:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case UBYTE:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case UINT16:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case UINT32:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case UINT64:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case BYTE:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case SHORT:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case INT:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case LONG:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case HALF:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case BFLOAT16:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case FLOAT:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case DOUBLE:
return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace());
case UTF8:
case COMPRESSED:
case UNKNOWN:
default:
throw new UnsupportedOperationException("Cannot create type: " + type);
}
}
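Each branch above attaches the buffer to the current workspace when one is active. A sketch of both paths; hedged: the workspace id "BENCH_WS" is illustrative, and the workspace-manager call is quoted from memory:

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.factory.Nd4j;

try (MemoryWorkspace ws = Nd4j.getWorkspaceManager().getAndActivateWorkspace("BENCH_WS")) {
    // allocated inside the active workspace
    DataBuffer attached = Nd4j.createBuffer(new long[]{10}, DataType.UINT16);
}
// no active workspace: plain off-heap allocation
DataBuffer plain = Nd4j.createBuffer(new long[]{10}, DataType.UINT16);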
@ -1424,19 +1471,31 @@ public class Nd4j {
switch (type){
case DOUBLE:
return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length);
case FLOAT:
return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length);
case HALF:
return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length);
case BFLOAT16:
return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length);
case UINT64:
return DATA_BUFFER_FACTORY_INSTANCE.createULong(length);
case LONG:
return DATA_BUFFER_FACTORY_INSTANCE.createLong(length);
case UINT32:
return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length);
case INT:
return DATA_BUFFER_FACTORY_INSTANCE.createInt(length);
case UINT16:
return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length);
case SHORT:
return DATA_BUFFER_FACTORY_INSTANCE.createShort(length);
case UBYTE:
return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length);
case BYTE:
return DATA_BUFFER_FACTORY_INSTANCE.createByte(length);
case BOOL:
return DATA_BUFFER_FACTORY_INSTANCE.createBool(length);
case UTF8:
case COMPRESSED:
case UNKNOWN:


@ -1161,4 +1161,10 @@ public abstract class NativeOps extends Pointer {
public abstract Pointer constantBuffer(int dtype, DoublePointer data, int length);
public abstract Pointer constantBuffer(int dtype, @Cast("Nd4jLong *") LongPointer data, int length);
public abstract String runLightBenchmarkSuit(boolean printOut);
public abstract String runFullBenchmarkSuit(boolean printOut);
public abstract long getCachedMemory(int deviceId);
}


@ -418,6 +418,126 @@ public class CudaDataBufferFactory implements DataBufferFactory {
return new CudaIntDataBuffer(length);
}
@Override
public DataBuffer createBFloat16(long length) {
return new CudaBfloat16DataBuffer(length);
}
@Override
public DataBuffer createUInt(long length) {
return new CudaUInt32DataBuffer(length);
}
@Override
public DataBuffer createUShort(long length) {
return new CudaUInt16DataBuffer(length);
}
@Override
public DataBuffer createUByte(long length) {
return new CudaUByteDataBuffer(length);
}
@Override
public DataBuffer createULong(long length) {
return new CudaUInt64DataBuffer(length);
}
@Override
public DataBuffer createBool(long length) {
return new CudaBoolDataBuffer(length);
}
@Override
public DataBuffer createShort(long length) {
return new CudaShortDataBuffer(length);
}
@Override
public DataBuffer createByte(long length) {
return new CudaByteDataBuffer(length);
}
@Override
public DataBuffer createBFloat16(long length, boolean initialize) {
return new CudaBfloat16DataBuffer(length, initialize);
}
@Override
public DataBuffer createUInt(long length, boolean initialize) {
return new CudaUInt32DataBuffer(length, initialize);
}
@Override
public DataBuffer createUShort(long length, boolean initialize) {
return new CudaUInt16DataBuffer(length, initialize);
}
@Override
public DataBuffer createUByte(long length, boolean initialize) {
return new CudaUByteDataBuffer(length, initialize);
}
@Override
public DataBuffer createULong(long length, boolean initialize) {
return new CudaUInt64DataBuffer(length, initialize);
}
@Override
public DataBuffer createBool(long length, boolean initialize) {
return new CudaBoolDataBuffer(length, initialize);
}
@Override
public DataBuffer createShort(long length, boolean initialize) {
return new CudaShortDataBuffer(length, initialize);
}
@Override
public DataBuffer createByte(long length, boolean initialize) {
return new CudaByteDataBuffer(length, initialize);
}
@Override
public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaBfloat16DataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaUInt32DataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaUInt16DataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaUByteDataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaUInt64DataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaBoolDataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaShortDataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) {
return new CudaByteDataBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createInt(long length, boolean initialize) {
return new CudaIntDataBuffer(length, initialize);


@ -2757,6 +2757,16 @@ public class CudaExecutioner extends DefaultOpExecutioner {
return buffer;
}
@Override
public String runLightBenchmarkSuit(boolean printOut) {
return nativeOps.runLightBenchmarkSuit(printOut);
}
@Override
public String runFullBenchmarkSuit(boolean printOut) {
return nativeOps.runFullBenchmarkSuit(printOut);
}
}


@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
*/
public native int getDeviceMajor(int deviceId);
/**
* This method returns the amount of cached memory
* @param deviceId
* @return
*/
public native @Cast("Nd4jLong") long getCachedMemory(int deviceId);
/**
*
* @param ptrToDeviceId
@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId);
public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer);
@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length);
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length);
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor);
public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut);
public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut);
}
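A hedged sketch of querying the new cache statistic from Java (device 0 is illustrative):

import org.nd4j.nativeblas.NativeOpsHolder;

long cachedBytes = NativeOpsHolder.getInstance().getDeviceNativeOps().getCachedMemory(0);
System.out.println("Cached memory on device 0: " + cachedBytes + " bytes");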


@ -2185,4 +2185,15 @@ public class NativeOpExecutioner extends DefaultOpExecutioner {
sb.append(". Output var names: ").append(Arrays.toString(outNames));
}
}
@Override
public String runLightBenchmarkSuit(boolean printOut) {
return loop.runLightBenchmarkSuit(printOut);
}
@Override
public String runFullBenchmarkSuit(boolean printOut) {
return loop.runFullBenchmarkSuit(printOut);
}
}


@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
*/
public native int getDeviceMajor(int deviceId);
/**
* This method returns the amount of cached memory
* @param deviceId
* @return
*/
public native @Cast("Nd4jLong") long getCachedMemory(int deviceId);
/**
*
* @param ptrToDeviceId
@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId);
public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer);
public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer);
@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps {
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length);
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length);
public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor);
public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut);
public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut);
}


@ -3306,6 +3306,28 @@ public class Nd4jTestsC extends BaseNd4jTest {
log.info("arrayf data: {}", Arrays.toString(arrayf.data().asFloat()));
}
@Test
public void testCreateDetached_1() {
val shape = new int[]{10};
val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF};
for(DataType dt : dataTypes){
val dataBuffer = Nd4j.createBufferDetached(shape, dt);
assertEquals(dt, dataBuffer.dataType());
}
}
@Test
public void testCreateDetached_2() {
val shape = new long[]{10};
val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF};
for(DataType dt : dataTypes){
val dataBuffer = Nd4j.createBufferDetached(shape, dt);
assertEquals(dt, dataBuffer.dataType());
}
}
@Test
public void testPairwiseMixedC() {
int[] shape2 = {12, 8};
@ -7889,6 +7911,7 @@ public class Nd4jTestsC extends BaseNd4jTest {
assertEquals(Nd4j.createFromArray(1f, 3f, 4f), out);
}
private static INDArray fwd(INDArray input, INDArray W, INDArray b){
INDArray ret = Nd4j.createUninitialized(input.size(0), W.size(1));
input.mmuli(W, ret);


@ -355,6 +355,7 @@ public interface DataBufferFactory {
DataBuffer create(DataType dataType, long length, boolean initialize, MemoryWorkspace workspace);
/**
* Create an int data buffer
*
@ -363,6 +364,33 @@ public interface DataBufferFactory {
*/
DataBuffer createInt(long length);
DataBuffer createBFloat16(long length);
DataBuffer createByte(long length);
DataBuffer createShort(long length);
DataBuffer createBool(long length);
DataBuffer createUShort(long length);
DataBuffer createUInt(long length);
DataBuffer createUByte(long length);
DataBuffer createULong(long length);
DataBuffer createBFloat16(long length, boolean initialize);
DataBuffer createByte(long length, boolean initialize);
DataBuffer createShort(long length, boolean initialize);
DataBuffer createBool(long length, boolean initialize);
DataBuffer createUShort(long length, boolean initialize);
DataBuffer createUInt(long length, boolean initialize);
DataBuffer createUByte(long length, boolean initialize);
DataBuffer createULong(long length, boolean initialize);
DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace);
DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace);
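A short usage sketch of the expanded factory surface (assumes the default backend's factory obtained via Nd4j):

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

DataBuffer u8  = Nd4j.getDataBufferFactory().createUByte(16);         // uninitialized
DataBuffer u16 = Nd4j.getDataBufferFactory().createUShort(16, true);  // zero-initialized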
/**
* Create an int data buffer, with optional initialization
*


@ -354,11 +354,132 @@ public class DefaultDataBufferFactory implements DataBufferFactory {
return new IntBuffer(length);
}
@Override
public DataBuffer createBFloat16(long length) {
return new BFloat16Buffer(length);
}
@Override
public DataBuffer createUInt(long length) {
return new UInt32Buffer(length);
}
@Override
public DataBuffer createUShort(long length) {
return new UInt16Buffer(length);
}
@Override
public DataBuffer createUByte(long length) {
return new UInt8Buffer(length);
}
@Override
public DataBuffer createULong(long length) {
return new UInt64Buffer(length);
}
@Override
public DataBuffer createBool(long length) {
return new BoolBuffer(length);
}
@Override
public DataBuffer createShort(long length) {
return new Int16Buffer(length);
}
@Override
public DataBuffer createByte(long length) {
return new Int8Buffer(length);
}
@Override
public DataBuffer createBFloat16(long length, boolean initialize) {
return new BFloat16Buffer(length, initialize);
}
@Override
public DataBuffer createUInt(long length, boolean initialize) {
return new UInt32Buffer(length, initialize);
}
@Override
public DataBuffer createUShort(long length, boolean initialize) {
return new UInt16Buffer(length, initialize);
}
@Override
public DataBuffer createUByte(long length, boolean initialize) {
return new UInt8Buffer(length, initialize);
}
@Override
public DataBuffer createULong(long length, boolean initialize) {
return new UInt64Buffer(length, initialize);
}
@Override
public DataBuffer createBool(long length, boolean initialize) {
return new BoolBuffer(length, initialize);
}
@Override
public DataBuffer createShort(long length, boolean initialize) {
return new Int16Buffer(length, initialize);
}
@Override
public DataBuffer createByte(long length, boolean initialize) {
return new Int8Buffer(length, initialize);
}
@Override
public DataBuffer createInt(long length, boolean initialize) {
return new IntBuffer(length, initialize);
}
@Override
public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) {
return new BFloat16Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) {
return new UInt32Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) {
return new UInt16Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) {
return new UInt8Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) {
return new UInt64Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) {
return new BoolBuffer(length, initialize, workspace);
}
@Override
public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) {
return new Int16Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) {
return new Int8Buffer(length, initialize, workspace);
}
@Override
public DataBuffer createInt(long length, boolean initialize, MemoryWorkspace workspace) {
return new IntBuffer(length, initialize, workspace);
@ -665,12 +786,12 @@ public class DefaultDataBufferFactory implements DataBufferFactory {
@Override
public DataBuffer createHalf(long length) {
throw new UnsupportedOperationException("FP16 isn't supported for CPU yet");
return new HalfBuffer(length);
}
@Override
public DataBuffer createHalf(long length, boolean initialize) {
throw new UnsupportedOperationException("FP16 isn't supported for CPU yet");
return new HalfBuffer(length, initialize);
}
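With HalfBuffer wired in, FP16 allocation no longer throws on CPU; a one-line sketch (assuming the default CPU backend):

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

DataBuffer half = Nd4j.getDataBufferFactory().createHalf(32, true);  // CPU-backed fp16, zeroed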
/**