From cb6654bebb21b7460e2382f6bfb892d1ac84941f Mon Sep 17 00:00:00 2001
From: Alex Black
Date: Fri, 12 Jul 2019 15:21:15 +1000
Subject: [PATCH] Add libnd4j benchmarks (#3)

This PR adds two libnd4j benchmarking suites
---
 libnd4j/blas/CMakeLists.txt | 8 +-
 libnd4j/blas/NativeOps.h | 12 +
 libnd4j/blas/cpu/NDArrayFactory.cpp | 3 +
 libnd4j/blas/cpu/NativeOps.cpp | 40 +
 libnd4j/blas/cuda/NativeOps.cu | 39 +
 libnd4j/include/helpers/BenchmarkHelper.h | 40 +-
 libnd4j/include/helpers/ConstantHelper.h | 4 +
 .../helpers/benchmark/DeclarableBenchmark.h | 2 +-
 .../include/helpers/cpu/ConstantHelper.cpp | 20 +-
 .../include/helpers/cuda/ConstantHelper.cu | 15 +-
 .../include/helpers/impl/BenchmarkHelper.cpp | 206 +-
 .../generic/recurrent/lstmBlock.cpp | 6 +-
 .../ops/declarable/helpers/cpu/gather.cpp | 4 +-
 .../ops/declarable/helpers/cpu/lstm.cpp | 9 +-
 .../ops/declarable/helpers/impl/lstm.cpp | 24 +-
 .../performance/benchmarking/BenchmarkSuit.h | 41 +
 .../benchmarking/FullBenchmarkSuit.h | 34 +
 .../benchmarking/LightBenchmarkSuit.h | 34 +
 .../benchmarking/impl/BenchmarkSuit.cpp | 20 +
 .../benchmarking/impl/FullBenchmarkSuit.cpp | 1921 +++++++++++++++++
 .../benchmarking/impl/LightBenchmarkSuit.cpp | 639 ++++++
 .../layers_tests/DeclarableOpsTests15.cpp | 24 +
 .../layers_tests/PlaygroundTests.cpp | 8 +
 .../tests_cpu/libnd4j_tests/CMakeLists.txt | 3 +-
 .../ops/executioner/DefaultOpExecutioner.java | 10 +
 .../api/ops/executioner/OpExecutioner.java | 4 +
 .../java/org/nd4j/linalg/factory/Nd4j.java | 101 +-
 .../java/org/nd4j/nativeblas/NativeOps.java | 6 +
 .../buffer/factory/CudaDataBufferFactory.java | 120 +
 .../ops/executioner/CudaExecutioner.java | 10 +
 .../java/org/nd4j/nativeblas/Nd4jCuda.java | 12 +
 .../nativecpu/ops/NativeOpExecutioner.java | 11 +
 .../java/org/nd4j/nativeblas/Nd4jCpu.java | 12 +
 .../test/java/org/nd4j/linalg/Nd4jTestsC.java | 23 +
 .../api/buffer/factory/DataBufferFactory.java | 28 +
 .../factory/DefaultDataBufferFactory.java | 125 +-
 36 files changed, 3473 insertions(+), 145 deletions(-)
 create mode 100644 libnd4j/include/performance/benchmarking/BenchmarkSuit.h
 create mode 100644 libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h
 create mode 100644 libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h
 create mode 100644 libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp
 create mode 100644 libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp
 create mode 100644 libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp

diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt
index ad8a781a6..8537fc59c 100755
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@@ -247,6 +247,7 @@ if(CUDA_BLAS)
 endif()
 endif()

+ file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h)
 file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h)
 file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/impl/*.cpp ../include/execution/*.cu ../include/execution/*.h)
 file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h)
@@ -267,7 +268,7 @@ if(CUDA_BLAS)
 ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
 cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
 Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
- ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES})
+ ${MEMORY_SOURCES} ${GRAPH_SOURCES} 
${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) else() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_TESTS=true") @@ -276,7 +277,7 @@ if(CUDA_BLAS) ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES}) + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) endif() @@ -300,6 +301,7 @@ elseif(CPU_BLAS) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true") endif() + file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h) file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h) file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/*.cpp ../include/execution/*.h) file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h) @@ -320,7 +322,7 @@ elseif(CPU_BLAS) ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} - ${OPS_SOURCES}) + ${OPS_SOURCES} ${PERF_SOURCES}) if(IOS) add_library(${LIBND4J_NAME} STATIC $) else() diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index 1c818d528..f28a76836 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -759,6 +759,13 @@ public: */ int getDeviceMajor(int deviceId); + /** + * This method returns amount of cached memory + * @param deviceId + * @return + */ + Nd4jLong getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -1653,6 +1660,7 @@ public: int unregisterGraph(Nd4jPointer *extraPointers, Nd4jLong graphId); + void deleteCharArray(Nd4jPointer pointer); void deleteIntArray(Nd4jPointer pointer); void deleteLongArray(Nd4jPointer pointer); void deletePointerArray(Nd4jPointer pointer); @@ -1690,6 +1698,10 @@ public: nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, Nd4jLong *data, int length); nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, double *data, int length); nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor); + + + const char* runLightBenchmarkSuit(bool printOut); + const char* runFullBenchmarkSuit(bool printOut); }; diff --git a/libnd4j/blas/cpu/NDArrayFactory.cpp b/libnd4j/blas/cpu/NDArrayFactory.cpp index 8fcd29eb7..ec99ef7db 100644 --- a/libnd4j/blas/cpu/NDArrayFactory.cpp +++ b/libnd4j/blas/cpu/NDArrayFactory.cpp @@ -204,6 +204,9 @@ template void NDArrayFactory::memcpyFromVector(void *ptr, const std::vector diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index d281bdfac..460c9d4b6 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -72,6 +72,9 @@ bool experimentalSupport = false; #include #include #include +#include +#include +#include using namespace nd4j; @@ -2304,6 +2307,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) { delete[] ptr; } +void NativeOps::deleteCharArray(Nd4jPointer pointer) { + auto ptr = reinterpret_cast(pointer); + delete[] ptr; +} + void 
NativeOps::deleteIntArray(Nd4jPointer pointer) { auto ptr = reinterpret_cast(pointer); delete[] ptr; @@ -2792,6 +2800,38 @@ void NativeOps::sortTadByValue(Nd4jPointer *extraPointers, BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortTadByValue(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); } +const char* NativeOps::runLightBenchmarkSuit(bool printOut) { + nd4j::LightBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +Nd4jLong NativeOps::getCachedMemory(int deviceId) { + return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); +} + +const char* NativeOps::runFullBenchmarkSuit(bool printOut) { + nd4j::FullBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + BUILD_SINGLE_TEMPLATE(template void flattenGeneric,(Nd4jPointer*, int, char, void*, Nd4jLong*, void*, Nd4jLong*), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void pullRowsGeneric, (void *, Nd4jLong*, void*, Nd4jLong*, const int, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*), LIBND4J_TYPES); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 4fa3a36fa..b56d5da94 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -47,6 +47,8 @@ using namespace nd4j; #include +#include +#include cudaDeviceProp *deviceProperties; cudaFuncAttributes *funcAttributes = new cudaFuncAttributes[64]; @@ -2804,6 +2806,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) { delete[] ptr; } +void NativeOps::deleteCharArray(Nd4jPointer pointer) { + auto ptr = reinterpret_cast(pointer); + delete[] ptr; +} + void NativeOps::deleteIntArray(Nd4jPointer pointer) { auto ptr = reinterpret_cast(pointer); delete[] ptr; @@ -3289,3 +3296,35 @@ Nd4jPointer NativeOps::shapeBufferForNumpy(Nd4jPointer npyArray) { } return reinterpret_cast(nd4j::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true)); } + +const char* NativeOps::runLightBenchmarkSuit(bool printOut) { + nd4j::LightBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +const char* NativeOps::runFullBenchmarkSuit(bool printOut) { + nd4j::FullBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +Nd4jLong NativeOps::getCachedMemory(int deviceId) { + return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); +} \ No newline at end of file diff --git a/libnd4j/include/helpers/BenchmarkHelper.h b/libnd4j/include/helpers/BenchmarkHelper.h index 7acea057d..58ed7e1b7 100644 --- a/libnd4j/include/helpers/BenchmarkHelper.h +++ b/libnd4j/include/helpers/BenchmarkHelper.h @@ -50,7 +50,7 @@ namespace nd4j { unsigned int _rIterations; protected: - void benchmarkOperation(OpBenchmark 
&benchmark); + std::string benchmarkOperation(OpBenchmark &benchmark); void benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z); @@ -58,34 +58,30 @@ namespace nd4j { void benchmarkGEMM(char orderA, std::initializer_list shapeA, char orderB, std::initializer_list shapeB, char orderC, std::initializer_list shapeC); - void printHeader(); + std::string printHeader(); public: BenchmarkHelper(unsigned int warmUpIterations = 10, unsigned int runIterations = 100); - void runOperationSuit(std::initializer_list benchmarks, const char *msg = nullptr); - void runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg = nullptr); + std::string runOperationSuit(std::initializer_list benchmarks, const char *msg = nullptr); + std::string runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg = nullptr); + std::string runOperationSuit(OpBenchmark* benchmark); - void runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = 
nullptr); + std::string runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - - - void runScalarSuit(); - - void runAllSuits(); + std::string runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); }; } diff --git a/libnd4j/include/helpers/ConstantHelper.h b/libnd4j/include/helpers/ConstantHelper.h index 3f31439c9..a7f7d0c00 100644 --- a/libnd4j/include/helpers/ConstantHelper.h +++ b/libnd4j/include/helpers/ConstantHelper.h @@ -44,6 +44,8 @@ namespace nd4j { std::vector _devicePointers; std::vector _deviceOffsets; std::mutex _mutex; + + std::vector _counters; public: ~ConstantHelper() = default; @@ -53,6 +55,8 @@ namespace nd4j { void* replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace = nullptr); ConstantDataBuffer* constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType); + + Nd4jLong getCachedAmount(int deviceId); }; } diff --git a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h index 717623647..0aa8c35a6 100644 --- a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h +++ b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h @@ -36,7 +36,7 @@ namespace nd4j { nd4j::graph::Context *_context = nullptr; public: DeclarableBenchmark(nd4j::ops::DeclarableOp &op, std::string name = 0) : OpBenchmark() { - _op = ops::OpRegistrator::getInstance()->getOperation(op.getOpHash()); + _op = &op; //ops::OpRegistrator::getInstance()->getOperation(op.getOpHash()); _testName = name; } diff --git a/libnd4j/include/helpers/cpu/ConstantHelper.cpp b/libnd4j/include/helpers/cpu/ConstantHelper.cpp index 008e43bfe..f74bd5637 100644 --- a/libnd4j/include/helpers/cpu/ConstantHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantHelper.cpp @@ -30,9 +30,11 @@ namespace nd4j { ConstantHelper::ConstantHelper() { int numDevices = getNumberOfDevices(); _cache.resize(numDevices); + _counters.resize(numDevices); for (int e = 0; e < numDevices; e++) { std::map map; _cache[e] = map; + _counters[e] = 0L; } } @@ -44,8 +46,14 @@ namespace nd4j { } void* ConstantHelper::replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace) { + if (workspace == nullptr) { + auto deviceId = getCurrentDevice(); + _counters[deviceId] += numBytes; + } + int8_t *ptr = nullptr; ALLOCATE(ptr, workspace, numBytes, int8_t); + std::memcpy(ptr, src, numBytes); return ptr; } @@ -71,7 +79,9 @@ namespace nd4j { if (holder->hasBuffer(dataType)) return holder->getConstantDataBuffer(dataType); else { - int8_t *cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)]; + auto size = descriptor.length() * DataTypeUtils::sizeOf(dataType); + auto cbuff = new int8_t[size]; + _counters[deviceId] += size; // create buffer with this dtype if (descriptor.isFloat()) { @@ -87,6 +97,14 @@ namespace nd4j { } } + Nd4jLong ConstantHelper::getCachedAmount(int deviceId) { + int numDevices = getNumberOfDevices(); + if (deviceId > numDevices || deviceId < 0) + 
return 0L; + else + return _counters[deviceId]; + } + nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; } diff --git a/libnd4j/include/helpers/cuda/ConstantHelper.cu b/libnd4j/include/helpers/cuda/ConstantHelper.cu index cb96630a9..d0579b66d 100644 --- a/libnd4j/include/helpers/cuda/ConstantHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantHelper.cu @@ -70,6 +70,7 @@ namespace nd4j { _devicePointers.resize(numDevices); _deviceOffsets.resize(numDevices); _cache.resize(numDevices); + _counters.resize(numDevices); // filling all pointers for (int e = 0; e < numDevices; e++) { @@ -83,6 +84,7 @@ namespace nd4j { _devicePointers[e] = constant; _deviceOffsets[e] = 0; _cache[e] = devCache; + _counters[e] = 0L; } // @@ -115,6 +117,7 @@ namespace nd4j { constantPtr = _devicePointers[deviceId]; constantOffset = _deviceOffsets[deviceId]; } + if (constantOffset + numBytes >= CONSTANT_LIMIT) { int8_t *ptr = nullptr; ALLOCATE_SPECIAL(ptr, workspace, numBytes, int8_t); @@ -154,7 +157,9 @@ namespace nd4j { if (holder->hasBuffer(dataType)) { return holder->getConstantDataBuffer(dataType); } else { - auto cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)]; + auto numBytes = descriptor.length() * DataTypeUtils::sizeOf(dataType); + auto cbuff = new int8_t[numBytes]; + _counters[deviceId] += numBytes; // create buffer with this dtype if (descriptor.isFloat()) { @@ -172,5 +177,13 @@ namespace nd4j { } } + Nd4jLong ConstantHelper::getCachedAmount(int deviceId) { + int numDevices = getNumberOfDevices(); + if (deviceId > numDevices || deviceId < 0) + return 0L; + else + return _counters[deviceId]; + } + nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp index e92c7220f..cbe0c0729 100644 --- a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp +++ b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp @@ -30,11 +30,11 @@ namespace nd4j { _rIterations = runIterations; } - void BenchmarkHelper::printHeader() { - nd4j_printf("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n",""); + std::string BenchmarkHelper::printHeader() { + return std::string("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n"); } - void BenchmarkHelper::benchmarkOperation(OpBenchmark &benchmark) { + std::string BenchmarkHelper::benchmarkOperation(OpBenchmark &benchmark) { for (uint i = 0; i < _wIterations; i++) benchmark.executeOnce(); @@ -57,9 +57,9 @@ namespace nd4j { std::sort(timings.begin(), timings.end()); Nd4jLong median = timings[_rIterations / 2]; - NDArray n = NDArrayFactory::create(timings, LaunchContext::defaultContext()); + auto n = NDArrayFactory::create(timings, LaunchContext::defaultContext()); - double stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); + auto stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); auto min = n.reduceNumber(nd4j::reduce::Min).e(0); auto max = n.reduceNumber(nd4j::reduce::Max).e(0); @@ -71,10 +71,16 @@ namespace nd4j { auto a = benchmark.axis(); auto inpl = benchmark.inplace(); + std::string temp; + temp.resize(65536); + // printing out stuff - nd4j_printf("%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(), + 
snprintf(const_cast(temp.data()), temp.length(), "%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(), _wIterations, _rIterations, t.c_str(), inpl.c_str(), s.c_str(), strides.c_str(), a.c_str(), o.c_str(), nd4j::math::nd4j_floor(sumT), median, min, max, stdev); + + auto pos = temp.find('\n'); + return temp.substr(0, pos + 1); } void BenchmarkHelper::benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z) { @@ -126,47 +132,44 @@ namespace nd4j { nd4j::math::nd4j_floor(sumT), median, min, max, stdev); } - void BenchmarkHelper::runOperationSuit(std::initializer_list benchmarks, const char *msg) { + std::string BenchmarkHelper::runOperationSuit(std::initializer_list benchmarks, const char *msg) { std::vector ops(benchmarks); - runOperationSuit(ops, msg); + return runOperationSuit(ops, msg); } - void BenchmarkHelper::runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg) { + std::string BenchmarkHelper::runOperationSuit(OpBenchmark* benchmark) { + return benchmarkOperation(*benchmark); + } + + std::string BenchmarkHelper::runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg) { + std::string result; + if (msg != nullptr && postHeaders) { - nd4j_printf("\n%s\n", msg); + result += "\n"; + result += msg; + result += "\n"; } if (postHeaders) - printHeader(); + result += printHeader(); for (auto v:benchmarks) - benchmarkOperation(*v); + result += benchmarkOperation(*v); + + return result; } - void BenchmarkHelper::runScalarSuit() { - printHeader(); - - std::initializer_list> shapes = {{100}, {32, 256}, {32, 150, 200}, {32, 3, 244, 244}, {32, 64, 128, 256}}; - std::initializer_list dataTypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE}; - std::initializer_list ops = {scalar::Add, scalar::Divide, scalar::Pow}; - - for (const auto &d:dataTypes) { - for (const auto &o:ops) { - for (const auto &s:shapes) { - //benchmarkScalarOperation(o, 2.0, s, d); - } - } - } - } - - void BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string result; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + result += "\n"; + result += message; + result += "\n"; } - printHeader(); + result += printHeader(); std::vector list; @@ -175,25 +178,26 @@ namespace nd4j { auto clone = reinterpret_cast(op->clone()); clone->setContext(ctx); - list.emplace_back(clone); + + result += runOperationSuit(clone); + + delete clone; } - runOperationSuit(list, false); - - // removing everything - for (auto v:list) { - delete reinterpret_cast(v); - } + return result; } - void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -217,16 +221,20 @@ namespace nd4j { 
result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet z; @@ -248,23 +256,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { - + std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -288,16 +300,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet z; @@ -319,22 +335,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string output; auto parameters = parametersBatch.parameters(); if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -358,16 +379,19 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string output; ResultSet x; x.setNonRemovable(); ResultSet z; @@ -389,19 +413,24 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void 
BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } printHeader(); @@ -436,16 +465,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet y; @@ -474,22 +507,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -518,23 +556,28 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -562,16 +605,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet y; @@ -597,22 +644,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const 
std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -637,12 +689,14 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp index debca0053..4cb33d8d4 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp @@ -42,13 +42,13 @@ CUSTOM_OP_IMPL(lstmBlock, 9, 7, false, 2, 2) { auto b = INPUT_VARIABLE(8); // biases, [4*numUnits] auto i = OUTPUT_VARIABLE(0); // Output - input modulation gate activations [seqLen, bS, numUnits] - auto c = OUTPUT_VARIABLE(1); // Activations, cell state (pre tanh) [seqLen, bs, numUnits] + auto c = OUTPUT_VARIABLE(1); // Activations, cell state (pre tanh) [seqLen, bs, numUnits] auto f = OUTPUT_VARIABLE(2); // Output - forget gate activations [seqLen, bs, numUnits] auto o = OUTPUT_VARIABLE(3); // Output - output gate activations [seqLen, bs, numUnits] auto z = OUTPUT_VARIABLE(4); // Output - input gate activations [seqLen, bs, numUnits] auto h = OUTPUT_VARIABLE(5); // Cell state, post tanh [seqLen, bs, numUnits] auto y = OUTPUT_VARIABLE(6); // current cell output [seqLen, bS, numProj], time t - + const int peephole = INT_ARG(0); // if 1, provide peephole connections const int dataFormat = INT_ARG(1); // 0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]; 2=NTS=[mb,seqLen,size] const double forgetBias = T_ARG(0); @@ -117,7 +117,7 @@ DECLARE_SHAPE_FN(lstmBlock) { //7 outputs, all same shape/type return SHAPELIST(s1, s1, s1, s1, s1, s1, s1); -} +} } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index f2e4e77bc..1a43fb250 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -56,7 +56,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... 
axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); -PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); @@ -72,7 +72,7 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->eleme } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; -PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i+1], {axis}); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index f0f5697d0..de4a0b08d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -171,15 +171,8 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast const int numUnits = cLast->sizeAt(1); //Concat inputs: [xt, yt-1]: concat([bs,nIn],[bs,nOut]) -> [bs, (nIn+nOut)] - nd4j::ops::concat concat; - Context cContext(119); auto concatOut = NDArrayFactory::create(xt->ordering(), {xt->sizeAt(0), xt->sizeAt(1) + yLast->sizeAt(1)}, xt->dataType(), xt->getContext()); - cContext.setInputArray(0, const_cast(xt), false); - cContext.setInputArray(1, const_cast(yLast), false); - cContext.setOutputArray(0, &concatOut, false); - cContext.getIArguments()->emplace_back(1); - - concat.execute(&cContext); + helpers::concat(xt->getContext(), {const_cast(xt), const_cast(yLast)}, concatOut, {1}); //NDArray* NDArrayFactory::create_( const char order, const std::vector &shape, nd4j::DataType dataType, nd4j::memory::Workspace* workspace) { std::vector shape = {bS, 4*numUnits}; diff --git a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp index d115f3fd0..da2175a36 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp @@ -45,10 +45,26 @@ namespace nd4j { const NDArray* iSeq, const NDArray* cSeq, const NDArray* fSeq, const NDArray* oSeq, const NDArray* zSeq, const NDArray* hSeq, const NDArray* ySeq, const std::vector& params, const int dataFormat){ - const int seqLen = xSeq->sizeAt(0); - const int mb = xSeq->sizeAt(1); - const int inSize = xSeq->sizeAt(2); - const int outSize = iSeq->sizeAt(2); + int seqLen, mb, inSize, outSize; + + if(dataFormat == 0) { + seqLen = xSeq->sizeAt(0); + mb = xSeq->sizeAt(1); + inSize = xSeq->sizeAt(2); + outSize = iSeq->sizeAt(2); + } + else if(dataFormat == 1) { + seqLen = xSeq->sizeAt(2); + mb = xSeq->sizeAt(0); + inSize = xSeq->sizeAt(1); + outSize = iSeq->sizeAt(1); + } + else if(dataFormat == 2) { + seqLen = xSeq->sizeAt(1); + mb = xSeq->sizeAt(0); + inSize = xSeq->sizeAt(2); + outSize = iSeq->sizeAt(2); + } const std::vector inSliceShape({mb,inSize}); const std::vector outSliceShape({mb,outSize}); diff --git a/libnd4j/include/performance/benchmarking/BenchmarkSuit.h b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h new file mode 100644 index 000000000..1a77dbd9f --- /dev/null +++ 
b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef LIBND4J_BENCHMARKSUIT_H
+#define LIBND4J_BENCHMARKSUIT_H
+
+#include
+#include
+#include
+#include
+#include
+
+namespace nd4j {
+    class ND4J_EXPORT BenchmarkSuit {
+    public:
+        BenchmarkSuit() = default;
+        // virtual destructor: this is a polymorphic base class (runSuit is pure virtual)
+        virtual ~BenchmarkSuit() = default;
+
+        virtual std::string runSuit() = 0;
+    };
+}
+
+
+#endif //LIBND4J_BENCHMARKSUIT_H
diff --git a/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h
new file mode 100644
index 000000000..dc2b63a4d
--- /dev/null
+++ b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h
@@ -0,0 +1,34 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef LIBND4J_FULLBENCHMARKSUIT_H
+#define LIBND4J_FULLBENCHMARKSUIT_H
+
+#include <performance/benchmarking/BenchmarkSuit.h>
+
+namespace nd4j {
+    class FullBenchmarkSuit : public BenchmarkSuit {
+    public:
+        std::string runSuit() override;
+    };
+}
+
+
+#endif //LIBND4J_FULLBENCHMARKSUIT_H
diff --git a/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h
new file mode 100644
index 000000000..35215d032
--- /dev/null
+++ b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h
@@ -0,0 +1,34 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef LIBND4J_LIGHTBENCHMARKSUIT_H
+#define LIBND4J_LIGHTBENCHMARKSUIT_H
+
+#include <performance/benchmarking/BenchmarkSuit.h>
+
+namespace nd4j {
+    class LightBenchmarkSuit : public BenchmarkSuit {
+    public:
+        std::string runSuit() override;
+    };
+}
+
+
+#endif //LIBND4J_LIGHTBENCHMARKSUIT_H
diff --git a/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp
new file mode 100644
index 000000000..902480092
--- /dev/null
+++ b/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp
@@ -0,0 +1,20 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+#include <performance/benchmarking/BenchmarkSuit.h>
diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp
new file mode 100644
index 000000000..40ecb6214
--- /dev/null
+++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp
@@ -0,0 +1,1921 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#ifdef _RELEASE + int wIterations = 4; + int rIterations = 20; + int gemmRegularUpperPow = 11; + int scalarBenchmarkPowLimit = 26; + int transformBenchmarkPowLimit = 26; + int intermediateTransformPowLimit = 22; + int intermediateTransformPowLimit2 = 18; + int pairwisePowLimit = 26; + int heavyPowLimit = 22; + int nonEwsPowLimit = 10; + int reduceScalarPowLimit = 26; + int stridedReductionPowLimit = 20; + int mismatchedAssignPowLimit = 26; + int gatherOpPowLimit = 18; + int gatherOpPowLimit2 = 16; + int gatherOpPowLimit3 = 12; + int broadcastMatrixRankLimit = 5; + int limit30 = 30; + int limit26 = 26; + int limit24 = 24; + int limit22 = 22; + int limit20 = 20; + int limit18 = 18; + int limit10 = 10; + int limit5 = 5; + int limit3 = 3; +#else + int wIterations = 0; + int rIterations = 1; + int gemmRegularUpperPow = 7; + int scalarBenchmarkPowLimit = 10; + int transformBenchmarkPowLimit = 10; + int intermediateTransformPowLimit = 10; + int intermediateTransformPowLimit2 = 10; + int pairwisePowLimit = 10; + int heavyPowLimit = 10; + int nonEwsPowLimit = 6; + int reduceScalarPowLimit = 10; + int stridedReductionPowLimit = 12; + int mismatchedAssignPowLimit = 2; + int gatherOpPowLimit = 10; + int gatherOpPowLimit2 = 8; + int gatherOpPowLimit3 = 8; + int broadcastMatrixRankLimit = 3; + int limit26 = 8; + int limit24 = 8; + int limit22 = 8; + int limit20 = 8; + int limit18 = 8; + int limit10 = 4; + int limit5 = 3; + int limit3 = 1; +#endif + +namespace nd4j { + + static std::string layerNormBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters nhwc("nhwc"); //0 = nchw + +#ifdef _RELEASE + int c = 32; + int hw = 64; +#else + int c = 3; + int hw = 8; +#endif + + ParametersBatch batch({&nhwc}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + + int axis; + if (n == 0) { + //nchw + auto input = NDArrayFactory::create_('c', {16, c, hw, hw}); + auto output = NDArrayFactory::create_('c', {16, c, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + axis = 1; + } else { + auto input = NDArrayFactory::create_('c', {32, hw, hw, c}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, c}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + axis = 3; + } + + auto bias = NDArrayFactory::create_('c', {c}); + ctx->setInputArray(1, bias, true); + auto iargs = new Nd4jLong[1]; + iargs[0] = axis; + ctx->setIArguments(iargs, 1); + delete[] iargs; + + return ctx; + }; + + nd4j::ops::layer_norm layerNorm; + DeclarableBenchmark benchmark(layerNorm, "layer norm"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Layer Norm"); + + return output; + } + + + static std::string maxPool3DBenchmark(){ + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters ncdhw("ncdhw"); //1 = ndhwc + ParametersBatch batch({&ncdhw}); + + nd4j::ops::maxpool3dnew maxpool3Dnew; + DeclarableBenchmark benchmark(maxpool3Dnew, "maxPool3d"); + +#ifdef _RELEASE + int mb = 16; + int chIn = 16; + int chOut = 16; + int dhw = 64; +#else + int mb = 1; + int chIn = 3; + int chOut = 3; + int dhw = 16; +#endif + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int format = p.getIntParam("ncdhw"); + + //Set inputs 
and outputs + //Same mode + stride 1: output is same shape as input + if(format == 1) { + //NDHWC + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + } else { + //NCDHW + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + } + + auto iargs = new Nd4jLong[15]; + //Kernel, strides, padding, dilation - x3 each + iargs[0] = 3; //Kernel + iargs[1] = 3; + iargs[2] = 3; + iargs[3] = 1; //Stride + iargs[4] = 1; + iargs[5] = 1; + iargs[6] = 0; //Padding + iargs[7] = 0; + iargs[8] = 0; + iargs[9] = 1; //Dilation + iargs[10] = 1; + iargs[11] = 1; + iargs[12] = 1; //Same mode + iargs[13] = 0; //Unused for max + iargs[14] = format; //0 = ncdhw + ctx->setIArguments(iargs, 14); + delete[] iargs; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "maxPool3d"); + return output; + } + + + static std::string conv3dBenchmark(){ + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters ncdhw("ncdhw"); //1 = ndhwc + ParametersBatch batch({&ncdhw}); + + nd4j::ops::conv3dnew conv3Dnew; + DeclarableBenchmark benchmark(conv3Dnew, "conv3d"); + +#ifdef _RELEASE + int mb = 16; + int chIn = 16; + int chOut = 16; + int dhw = 64; +#else + int mb = 1; + int chIn = 3; + int chOut = 3; + int dhw = 16; +#endif + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int format = p.getIntParam("ncdhw"); + + //Set inputs and outputs + //Same mode + stride 1: output is same shape as input + if(format == 1) { + //NDHWC + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + } else { + //NCDHW + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + } + + //Weights and bias: + ctx->setInputArray(1, NDArrayFactory::create_('c', {3, 3, 3, chIn, chOut}), true); + ctx->setInputArray(2, NDArrayFactory::create_('c', {chOut}), true); + + + auto iargs = new Nd4jLong[14]; + //Kernel, strides, padding, dilation - x3 each + iargs[0] = 3; //Kernel + iargs[1] = 3; + iargs[2] = 3; + iargs[3] = 1; //Stride + iargs[4] = 1; + iargs[5] = 1; + iargs[6] = 0; //Padding + iargs[7] = 0; + iargs[8] = 0; + iargs[9] = 1; //Dilation + iargs[10] = 1; + iargs[11] = 1; + iargs[12] = 1; //Same mode + iargs[13] = format; //0 = ncdhw + ctx->setIArguments(iargs, 14); + delete[] iargs; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "CNN3D"); + return output; + } + + + static std::string lstmBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen] +#ifdef _RELEASE + PredefinedParameters mb("mb", {1, 8, 64}); + PredefinedParameters nInOut("nInOut", {32, 256, 1024}); +#else + PredefinedParameters mb("mb", {1}); + PredefinedParameters nInOut("nInOut", {32}); +#endif + + ParametersBatch batch({&format, &mb, &nInOut}); + nd4j::ops::lstmBlock lstmBlock; + DeclarableBenchmark benchmark(lstmBlock, "lstm"); + + int seqLength = 32; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int f = p.getIntParam("format"); + int m = 
p.getIntParam("mb"); + int n = p.getIntParam("nInOut"); + + Nd4jLong l = 0; + ctx->setInputArray(0, NDArrayFactory::create_(l), true); //Max TS length (unused) + + + if (f == 0) { + //TNS format + ctx->setInputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('c', {seqLength, m, n}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('c', {seqLength, m, n}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('c', {seqLength, m, n}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('c', {seqLength, m, n}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('c', {seqLength, m, n}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('c', {seqLength, m, n}), true); //y + } else { + //NST format + ctx->setInputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('f', {m, n, seqLength}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('f', {m, n, seqLength}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('f', {m, n, seqLength}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('f', {m, n, seqLength}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('f', {m, n, seqLength}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('f', {m, n, seqLength}), true); //y + } + + auto cLast = NDArrayFactory::create_('c', {m, n}); + auto yLast = NDArrayFactory::create_('c', {m, n}); + auto W = NDArrayFactory::create_('c', {2 * n, 4 * n}); + auto Wci = NDArrayFactory::create_('c', {n}); + auto Wcf = NDArrayFactory::create_('c', {n}); + auto Wco = NDArrayFactory::create_('c', {n}); + auto b = NDArrayFactory::create_('c', {4 * n}); + + ctx->setInputArray(2, cLast, true); + ctx->setInputArray(3, yLast, true); + ctx->setInputArray(4, W, true); + ctx->setInputArray(5, Wci, true); + ctx->setInputArray(6, Wcf, true); + ctx->setInputArray(7, Wco, true); + ctx->setInputArray(8, b, true); + + auto iargs = new Nd4jLong[2]; + iargs[0] = 0; //No peephole + iargs[1] = f; + ctx->setIArguments(iargs, 2); + delete[] iargs; + + auto targs = new double[2]; + targs[0] = 1.0; //forget bias + targs[1] = 0.0; //cell clipping value + ctx->setTArguments(targs, 2); + delete[] targs; + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock"); + return output; + } + + static std::string batchnormBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {16}); +#endif + + ParametersBatch batch({&nhwc, &c, &hw}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int hw = p.getIntParam("hw"); + int ch = p.getIntParam("c"); + + auto args = new Nd4jLong[3]; + args[0] = args[1] = 1; //apply scale and offset + if (n == 0) { + auto input = NDArrayFactory::create_('c', {32, ch, hw, hw}); + auto output = NDArrayFactory::create_('c', {32, ch, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + args[2] = 1; //axis + } else { + auto input = 
NDArrayFactory::create_('c', {32, hw, hw, ch}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, ch}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + args[2] = 3; //axis + } + ctx->setIArguments(args, 3); + delete[] args; + + ctx->setInputArray(1, NDArrayFactory::create_('c', {ch}), true); //mean + auto v = NDArrayFactory::create_('c', {ch}); + v->assign(1.0f); + ctx->setInputArray(2, v, true); //variance + auto g = NDArrayFactory::create_('c', {ch}); + g->assign(1.0); + ctx->setInputArray(3, g, true); //gamma + auto b = NDArrayFactory::create_('c', {ch}); + b->assign(1.0); + ctx->setInputArray(4, b, true); //beta + + auto targs = new double[1]; + targs[0] = 1e-5; + ctx->setTArguments(targs, 1); + delete[] targs; + + return ctx; + }; + + nd4j::ops::batchnorm_new batchnorm; + DeclarableBenchmark benchmark(batchnorm, "batchnorm"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Batch Normalization"); + + return output; + } + + static std::string pool2dBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters k("k", {2, 3, 5}); + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters k("k", {2}); + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {8}); +#endif + + ParametersBatch batch({&nhwc, &k, &c, &hw}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int hw = p.getIntParam("hw"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + auto output = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto args = new Nd4jLong[11]; + args[0] = args[1] = khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = 0; //Divisor mode - 0 = exclude padding in divisor + args[10] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 11); + delete[] args; + + return ctx; + }; + + nd4j::ops::avgpool2d avgpool2d; + DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); + output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pooling 2d Operation"); + + nd4j::ops::maxpool2d maxpool2d; + DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); + output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pooling 2d Operation"); + return output; + } + + static std::string conv2dBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters k("k", {2, 3, 5}); + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters k("k", {2}); + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {8}); +#endif + ParametersBatch batch({&nhwc, &k, &c, &hw}); + nd4j::ops::conv2d conv2d; + DeclarableBenchmark benchmark(conv2d, "conv2d"); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); 
+            int n = p.getIntParam("nhwc");
+            int hw = p.getIntParam("hw");
+            int khw = p.getIntParam("k");
+
+            if (n == 0) {
+                auto input = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
+                auto output = NDArrayFactory::create_<float>('c', {32, p.getIntParam("c"), hw, hw});
+                ctx->setInputArray(0, input, true);
+                ctx->setOutputArray(0, output, true);
+            } else {
+                auto input = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
+                auto output = NDArrayFactory::create_<float>('c', {32, hw, hw, p.getIntParam("c")});
+                ctx->setInputArray(0, input, true);
+                ctx->setOutputArray(0, output, true);
+            }
+
+            auto b = NDArrayFactory::create_<float>('c', {p.getIntParam("c")});
+            auto w = NDArrayFactory::create_<float>('c', {khw, khw, p.getIntParam("c"), p.getIntParam("c")});  // [kH, kW, iC, oC] always
+
+            ctx->setInputArray(1, w, true);
+            ctx->setInputArray(2, b, true);
+
+            auto args = new Nd4jLong[10];
+            args[0] = args[1] = khw; //Kernel
+            args[2] = args[3] = 1; //Stride
+            args[4] = args[5] = 0; //Pad
+            args[6] = args[7] = 1; //Dilation
+            args[8] = 1; //SAME
+            args[9] = n; //0=nchw, 1=nhwc
+            ctx->setIArguments(args, 10);
+            delete[] args;
+
+            return ctx;
+        };
+
+        output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d Operation");
+        return output;
+    }
+
+    static std::string rngBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+        //Uniform, gaussian and bernoulli RNG generation
+
+        IntPowerParameters length("length", 2, 4, scalarBenchmarkPowLimit, 3); //2^4 up to 2^scalarBenchmarkPowLimit, in steps of 3
+
+        ParametersBatch batch({&length});
+
+        auto gen01 = PARAMETRIC_D() {
+            auto ctx = new Context(1);
+            ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2}, {1, p.getIntParam("length")}), true); //Shape as NDArray
+            ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
+            auto d = new double[2];
+            d[0] = 0.0;
+            d[1] = 1.0;
+            ctx->setTArguments(d, 2);
+            delete[] d;
+            return ctx;
+        };
+
+        auto gen05 = PARAMETRIC_D() {
+            auto ctx = new Context(1);
+            ctx->setInputArray(0, NDArrayFactory::create_<Nd4jLong>('c', {2}, {1, p.getIntParam("length")}), true); //Shape as NDArray
+            ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {1, p.getIntParam("length")}), true);
+            auto d = new double[1];
+            d[0] = 0.5;
+            ctx->setTArguments(d, 1);
+            delete[] d;
+            return ctx;
+        };
+
+        nd4j::ops::LegacyRandomOp unif(random::UniformDistribution);
+        DeclarableBenchmark dbU(unif, "uniform");
+        output += helper.runOperationSuit(&dbU, gen01, batch, "Uniform Distribution");
+
+        nd4j::ops::LegacyRandomOp gaussian(random::GaussianDistribution);
+        DeclarableBenchmark dbG(gaussian, "gaussian");
+        output += helper.runOperationSuit(&dbG, gen01, batch, "Gaussian Distribution");
+
+        nd4j::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution);
+        DeclarableBenchmark dbTU(trunc, "trunc.norm");
+        output += helper.runOperationSuit(&dbTU, gen01, batch, "Truncated Normal Distribution");
+
+        nd4j::ops::LegacyRandomOp ln(random::LogNormalDistribution);
+        DeclarableBenchmark dbLN(ln, "lognormal");
+        output += helper.runOperationSuit(&dbLN, gen01, batch, "Log Normal Distribution");
+
+        nd4j::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution);
+        DeclarableBenchmark dbB(bernoulli, "bernoulli");
+        output += helper.runOperationSuit(&dbB, gen05, batch, "Bernoulli Distribution");
+
+        nd4j::ops::LegacyRandomOp dropout(random::BernoulliDistribution);
+        DeclarableBenchmark dbD(dropout, "dropout");
+        output += helper.runOperationSuit(&dbD, gen05, batch, "Dropout");
+
+        return output;
+    }
+
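+    //The next suite probes GEMM shapes just off the 1024 boundary (1020..1028), where
+    //padding and cache/register-blocking effects in the underlying BLAS kernels can show up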
+    static std::string gemmIrregularBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+
+        //Basically the same as above, but with irregular shapes (not multiples of 8, etc)
+
+#ifdef _RELEASE
+        int tAMax = 1;
+        int tBMax = 1;
+        int b = 1024;
+        int c = 1024;
+#else
+        int tAMax = 1;
+        int tBMax = 1;
+        int b = 32;
+        int c = 32;
+#endif
+
+        for (int tA = 0; tA <= tAMax; tA++) {
+            for (int tB = 0; tB <= tBMax; tB++) {
+                IntParameters d("d", 1020, 1028, 1); //1020, 1021, ..., 1028
+                ParametersBatch dim({&d});
+
+                //Vary A.rows:
+                auto generator = PARAMETRIC_XYZ() {
+                    auto a = p.getIntParam("d");
+                    std::vector<Nd4jLong> shapeA;
+                    std::vector<Nd4jLong> shapeB;
+                    if (tA) {
+                        shapeA = {b, a};
+                    } else {
+                        shapeA = {a, b};
+                    }
+                    if (tB) {
+                        shapeB = {c, b};
+                    } else {
+                        shapeB = {b, c};
+                    }
+                    auto A = NDArrayFactory::create_<float>('c', shapeA);
+                    auto B = NDArrayFactory::create_<float>('c', shapeB);
+                    auto C = NDArrayFactory::create_<float>('f', {a, c});
+
+                    x.push_back(A);
+                    y.push_back(B);
+                    z.push_back(C);
+                };
+
+                std::string n;
+                n += "Gemm (a.rows) - tA=";
+                n += std::to_string(tA);
+                n += ", tB=";
+                n += std::to_string(tB);
+
+                MatrixBenchmark mb(1.0, 0.0, tA, tB, n);
+
+                output += helper.runOperationSuit(&mb, generator, dim, n.c_str());
+
+                //Vary A.columns / B.rows
+                auto generator2 = PARAMETRIC_XYZ() {
+                    auto a = 1024;
+                    auto b = p.getIntParam("d");
+                    auto c = 1024;
+                    std::vector<Nd4jLong> shapeA;
+                    std::vector<Nd4jLong> shapeB;
+                    if (tA) {
+                        shapeA = {b, a};
+                    } else {
+                        shapeA = {a, b};
+                    }
+                    if (tB) {
+                        shapeB = {c, b};
+                    } else {
+                        shapeB = {b, c};
+                    }
+                    auto A = NDArrayFactory::create_<float>('c', shapeA);
+                    auto B = NDArrayFactory::create_<float>('c', shapeB);
+                    auto C = NDArrayFactory::create_<float>('f', {a, c});
+
+                    x.push_back(A);
+                    y.push_back(B);
+                    z.push_back(C);
+                };
+
+                std::string n2;
+                n2 += "Gemm (a.columns) - tA=";
+                n2 += std::to_string(tA);
+                n2 += ", tB=";
+                n2 += std::to_string(tB);
+
+                MatrixBenchmark mb2(1.0, 0.0, tA, tB, n2);
+
+                output += helper.runOperationSuit(&mb2, generator2, dim, n2.c_str());
+
+                //Vary B.columns
+                auto generator3 = PARAMETRIC_XYZ() {
+                    auto a = 1024;
+                    auto b = 1024;
+                    auto c = p.getIntParam("d");
+                    std::vector<Nd4jLong> shapeA;
+                    std::vector<Nd4jLong> shapeB;
+                    if (tA) {
+                        shapeA = {b, a};
+                    } else {
+                        shapeA = {a, b};
+                    }
+                    if (tB) {
+                        shapeB = {c, b};
+                    } else {
+                        shapeB = {b, c};
+                    }
+                    auto A = NDArrayFactory::create_<float>('c', shapeA);
+                    auto B = NDArrayFactory::create_<float>('c', shapeB);
+                    auto C = NDArrayFactory::create_<float>('f', {a, c});
+
+                    x.push_back(A);
+                    y.push_back(B);
+                    z.push_back(C);
+                };
+
+                std::string n3;
+                n3 += "Gemm (b.columns) - tA=";
+                n3 += std::to_string(tA);
+                n3 += ", tB=";
+                n3 += std::to_string(tB);
+
+                MatrixBenchmark mb3(1.0, 0.0, tA, tB, n3);
+
+                output += helper.runOperationSuit(&mb3, generator3, dim, n3.c_str());
+            }
+        }
+
+        return output;
+    }
+
+    static std::string batchGemmBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+
+        //Rank 3 - [32,1024,1024]x[32,1024,1024]
+        //Rank 4 - [4,8,1024,1024]x[4,8,1024,1024]
+
+        IntParameters rank("rank", 3, 4, 1);
+
+        ParametersBatch b({&rank});
+
+        auto generator = PARAMETRIC_D() {
+            auto rank = p.getIntParam("rank");
+            std::vector<Nd4jLong> shapeA;
+            std::vector<Nd4jLong> shapeB;
+            auto ctx = new Context(1);
+
+            if(rank == 3){
+                ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
+                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
+                ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {32, 1024, 1024}), true);
+            } else {
+                ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {4, 
8, 1024, 1024}), true); + ctx->setInputArray(1, NDArrayFactory::create_('c', {4, 8, 1024, 1024}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {4, 8, 1024, 1024}), true); + } + + return ctx; + }; + + nd4j::ops::matmul mmul; + DeclarableBenchmark benchmark(mmul, "mmul (batch)"); + output += helper.runOperationSuit(&benchmark, generator, b, "MMul (batch)"); + + return output; + } + + static std::string gemmRegularBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + for (int o = 0; o <= 1; o++) { + char resultOrder = (o == 0 ? 'f' : 'c'); + for (int tA = 0; tA <= 1; tA++) { + for (int tB = 0; tB <= 1; tB++) { + + + IntPowerParameters pa("sz", 2, 7, gemmRegularUpperPow, 2); //2^7=128, 2^9=512, 2^11=2048 + + ParametersBatch b({&pa}); + + auto generator = PARAMETRIC_XYZ() { + auto s = p.getIntParam("sz"); + auto A = NDArrayFactory::create_('c', {s, s}); + auto B = NDArrayFactory::create_('c', {s, s}); + auto C = NDArrayFactory::create_(resultOrder, {s, s}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n; + n += "Gemm - tA="; + n += std::to_string(tA); + n += ", tB="; + n += std::to_string(tB); + n += ", cOrder="; + n += resultOrder; + + MatrixBenchmark mb(1.0, 0.0, tA == 0 ? false : true, tB == 0 ? false : true, n); + + output += helper.runOperationSuit(&mb, generator, b, n.c_str()); + } + } + } + + return output; + } + + static std::string scatterOpBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^26 in steps of 4 + ParametersBatch batch({&length}); + + //Gather 1D tests - 1d ref, 1d indices, 1d updates -> 1d output + nd4j::ops::scatter_upd scatter_update1; + DeclarableBenchmark sa1d(scatter_update1, "scatter_update1d"); + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int length = p.getIntParam("length"); + auto in = NDArrayFactory::create_('c', {length}); + auto indices = NDArrayFactory::create_('c', {length}); + auto updates = NDArrayFactory::create_('c', {length}); + + int* a = new int[length]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy! + ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa1d, generator, batch, "Scatter Update - 1d"); + + //Gather 2D tests - 2d input, 1d indices, 2d updates -> 2d output + IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^16 in steps of 2: 2^10, ..., 2^20 + PredefinedParameters cols("cols", {32}); + ParametersBatch batch2({&rows, &cols}); + nd4j::ops::scatter_upd scatter_update2; + DeclarableBenchmark sa2d(scatter_update2, "scatter_update2d"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = p.getIntParam("cols"); + auto in = NDArrayFactory::create_('c', {rows, cols}); + auto indices = NDArrayFactory::create_('c', {rows}); + auto updates = NDArrayFactory::create_('c', {rows, cols}); + + int* a = new int[rows]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy! 
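+ //scatter_upd writes through to its input buffer, so the input array is reused as the output; this times just the scatter itself, not an unrelated allocation and copy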
+ ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa2d, generator2, batch2, "Scatter Update - 2d"); + + //Gather 3D tests - 3d input, 1d indices -> 3d output + IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); + PredefinedParameters sz1("sz1", {32}); + ParametersBatch batch3({&sz0, &sz1}); + nd4j::ops::scatter_upd scatter_update3; + DeclarableBenchmark sa3d(scatter_update3, "scatter3d"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + int sz0 = p.getIntParam("sz0"); + int sz1 = p.getIntParam("sz1"); + auto in = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + auto indices = NDArrayFactory::create_('c', {sz0}); + auto updates = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + + int* a = new int[sz0]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy! + ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa3d, generator3, batch3, "Scatter Update - 3d"); + return output; + } + + static std::string gatherOpBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 to 2^22 in steps of 4 + ParametersBatch batch({&length}); + + //Gather 1D tests - 1d input, 1d indices -> 1d output + nd4j::ops::gather gather1; + DeclarableBenchmark gather1d(gather1, "gather1d"); + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int length = p.getIntParam("length"); + auto in = NDArrayFactory::create_('c', {length}); + auto indices = NDArrayFactory::create_('c', {length}); + int* a = new int[length]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {length}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather1d, generator, batch, "Gather - 1d"); + + //Gather 2D tests - 2d input, 1d indices -> 2d output + IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^20 in steps of 2: 2^10, ..., 2^20 + PredefinedParameters cols("cols", {32}); + ParametersBatch batch2({&rows, &cols}); + nd4j::ops::gather gather2; + DeclarableBenchmark gather2d(gather2, "gather2d"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = p.getIntParam("cols"); + auto in = NDArrayFactory::create_('c', {rows, cols}); + auto indices = NDArrayFactory::create_('c', {rows}); + + int* a = new int[rows]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {rows, cols}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather2d, generator2, batch2, "Gather - 2d"); + + //Gather 3D tests - 3d input, 1d indices -> 3d output + IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); //2^8 to 2^16 in steps of 4 + PredefinedParameters sz1("sz1", {32}); + ParametersBatch batch3({&sz0, &sz1}); + nd4j::ops::gather gather3; + DeclarableBenchmark gather3d(gather3, "gather3d"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + int sz0 = p.getIntParam("sz0"); + int sz1 = p.getIntParam("sz1"); + auto in = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + auto indices = 
NDArrayFactory::create_('c', {sz0}); + + int* a = new int[sz0]; + for( int i=0; ip(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather3d, generator3, batch3, "Gather - 3d"); + + return output; + } + + static std::string mismatchedOrdersAssignBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters rows("rows", 2, 2, mismatchedAssignPowLimit, 4); //2^2 to 2^26 in steps of 2 - 2^1=2, ..., 2^26=67108864 + BoolParameters cf("cf"); + + ParametersBatch batch({&rows, &cf}); + + auto generator = PARAMETRIC_XZ() { + int numElements = 67108864; //2^26 + int rows = p.getIntParam("rows"); + int cols = numElements / rows; + bool c = p.getIntParam("cf"); + + auto arr = NDArrayFactory::create_(c ? 'c' : 'f', {rows, cols}); + auto arr2 = NDArrayFactory::create_(c ? 'f' : 'c', {rows, cols}); + x.push_back(arr); + z.push_back(arr2); + }; + + TransformBenchmark tb(transform::AnyOps::Assign, "assign"); + output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign"); + + //Also test: NCHW to NHWC and back + BoolParameters nchw("nchw"); + ParametersBatch batch2({&nchw}); + auto generator2 = PARAMETRIC_XZ() { + bool nchw = p.getIntParam("nchw"); + + if(nchw) { + auto orig = NDArrayFactory::create_('c', {16, 32, 64, 64}); + orig->permutei({0,2,3,1}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {16, 64, 64, 32})); + } else { + auto orig = NDArrayFactory::create_('c', {16, 64, 64, 32}); + orig->permutei({0,3,1,2}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {16, 32, 64, 64})); + } + }; + + TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw"); + output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign"); + return output; + } + + static std::string broadcastOpsMatrixBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Broadcast ops: matrices for rank 3, 4, 5 + for( int rank=3; rank <= broadcastMatrixRankLimit; rank++ ){ + int numAxisTests = -1; + if(rank == 3){ + numAxisTests = 3; + } else if(rank == 4){ + numAxisTests = 6; + } else if(rank == 5){ + numAxisTests = 10; + } + + IntParameters testNum("testNum", 0,numAxisTests-1,1); + ParametersBatch b({&testNum}); + + auto generator = PARAMETRIC_D(){ + int n = p.getIntParam("testNum"); + std::vector axis({}); + switch(n){ + //rank 3+ + case 0: + axis = std::vector({0,1}); + break; + case 1: + axis = std::vector({0,2}); + break; + case 2: + axis = std::vector({1,2}); + break; + //rank 4+ + case 3: + axis = std::vector({0,3}); + break; + case 4: + axis = std::vector({1,3}); + break; + case 5: + axis = std::vector({2,3}); + break; + //Rank 5 + case 6: + axis = std::vector({0,4}); + break; + case 7: + axis = std::vector({1,4}); + break; + case 8: + axis = std::vector({2,4}); + break; + case 9: + axis = std::vector({3,4}); + break; + } + + + std::vector shape({}); + std::vector toBcShape({}); + int vectorLength; + if(rank == 3){ + shape = std::vector({64,64,64}); + toBcShape = std::vector({64,64,64}); + vectorLength = 64; + } else if(rank == 4){ + shape = std::vector({32,32,32,32}); + toBcShape = std::vector({32,32,32,32}); + vectorLength = 32; + } else if(rank == 5){ + shape = std::vector({16,16,16,16,16}); + toBcShape = std::vector({16,16,16,16,16}); + vectorLength = 16; + } 
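+                //toBcShape starts as a copy of "shape"; dimensions outside the chosen axis pair are set to 1 below, so input 1 is a matrix broadcast over the remaining dimensions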
+
+                for( int i=0; i<rank; i++ ){
+                    if(std::find(axis.begin(), axis.end(), i) == axis.end()){
+                        toBcShape[i] = 1;
+                    }
+                }
+
+                auto ctx = new Context(1);
+                ctx->setInputArray(0, NDArrayFactory::create_<float>('c', shape), true);
+                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', toBcShape), true);
+                ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', shape), true);
+                return ctx;
+            };
+
+            std::string name;
+            name += "Broadcast Matrix Add (Custom) - Rank";
+            name += std::to_string(rank);
+
+            nd4j::ops::add op;
+            DeclarableBenchmark benchmark(op, "add");
+            output += helper.runOperationSuit(&benchmark, generator, b, name.c_str());
+        }
+
+        return output;
+    }
+
+
+    static std::string broadcast2dBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+
+        PredefinedParameters rows("rows", {65536});
+        IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2, 2^6, 2^10
+        BoolParameters axis("axis");
+        BoolParameters inplace("inplace");
+
+        ParametersBatch batch({&rows, &cols, &axis, &inplace});
+
+        auto generator = PARAMETRIC_D() {
+            auto a = p.getIntParam("axis");
+            auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
+
+            auto ctx = new Context(1);
+            ctx->setInputArray(0, arr, true);
+            if(a == 0){
+                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), 1}), true);
+            } else {
+                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
+            }
+            if (p.getIntParam("inplace") == 1) {
+                ctx->setOutputArray(0, arr);
+                ctx->markInplace(true);
+            } else {
+                ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}), true);
+            }
+            return ctx;
+        };
+
+        std::string s("add");
+        nd4j::ops::add op;
+        DeclarableBenchmark benchmark(op, "add");
+        output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
+        return output;
+    }
+
+    static std::string broadcastBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+
+        //Broadcast ops: vectors for rank 2, 3, 4, 5
+        for( int axis=0; axis<=1; axis++ ){
+            PredefinedParameters rows("rows", {65536});
+            IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2 to 2^10 in steps of 4: 2^2, 2^6, 2^10
+            BoolParameters inplace("inplace");
+
+            ParametersBatch batch({&rows, &cols, &inplace});
+
+            auto generator = PARAMETRIC_XYZ() {
+                auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
+                x.push_back(arr);
+                if(axis == 0){
+                    y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows")}));
+                } else {
+                    y.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("cols")}));
+                }
+                if (p.getIntParam("inplace") == 1) {
+                    z.push_back(arr);
+                } else {
+                    z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
+                }
+            };
+
+            std::string s("bAdd"); s += std::to_string(axis); s += "r2";
+            BroadcastBenchmark bAdd(broadcast::Add, s, {axis});
+            output += helper.runOperationSuit(&bAdd, generator, batch, "Broadcast Add - Rank 2");
+        }
+
+        for( int rank=3; rank<=5; rank++ ){
+            for( int axis=1; axis<rank; axis++ ){
+                std::vector<Nd4jLong> shape({});
+                int vectorLength;
+                if(rank == 3){
+                    shape = std::vector<Nd4jLong>({32,128,128});
+                    vectorLength = 128;
+                } else if(rank == 4){
+                    shape = std::vector<Nd4jLong>({16,64,64,64});
+                    vectorLength = 64;
+                } else if(rank == 5){
+                    shape = std::vector<Nd4jLong>({16,48,48,48,48});
+                    vectorLength = 48;
+                }
+
+                ParametersBatch batch({});
+
+                //Note: always inplace here
+                auto generator = PARAMETRIC_XYZ() {
+                    auto arr = NDArrayFactory::create_<float>('c', shape);
+                    x.push_back(arr);
+                    y.push_back(NDArrayFactory::create_<float>('c', {vectorLength}));
+                    z.push_back(arr);
+                };
+
+                std::string 
name("bArr-r"); name += std::to_string(rank); name += "a"; name += std::to_string(axis); + BroadcastBenchmark bAdd(broadcast::Add, name, {axis}); + std::string n2("Broadcast Add - Rank"); n2 += std::to_string(rank); n2 += " - axis="; n2 += std::to_string(axis); + output += helper.runOperationSuit(&bAdd, generator, batch, n2.c_str()); + } + } + + return output; + } + + static std::string fastStridedReductionNonEws() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters stride("stride", 2, 0, 10, 2); //2^0=1, ..., 2^10=1024 + + ParametersBatch batch({&stride}); + + //This is an edge case: technically an EWS *should* be available here + auto generator1 = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {131072 + (stride == 1 ? 0 : 1), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::interval(0,131072), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum"); + output += helper.runOperationSuit(&rbSum, (const std::function)(generator1), batch, "Strided Sum - No EWS Test 1"); + + + //No EWS defined for this case + auto generator2 = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::interval(0,2*1024,2), NDIndex::all(), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS"); + output += helper.runOperationSuit(&rbSum2, (const std::function)(generator2), batch, "Strided Sum - No EWS Test 2"); + + return output; + } + + static std::string fastStridedReductionIrregular() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4 + PredefinedParameters stride("stride", {26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028}); + + ParametersBatch batch({&length, &stride}); + + auto generator = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Strided Sum - Irregular Strides"); + + return output; + } + + static std::string fastStridedReductionsRegular() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 12, 
stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4 + IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024 + + ParametersBatch batch({&length, &stride}); + + auto generator = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::point(0)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); +// z.push_back(NDArrayFactory::create_(0.0f)); + z.push_back(NDArrayFactory::create_('c', {1})); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "Strided Sum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Strided Sum - Regular Strides (powers of 2)"); + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::point(0)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + ctx->setInputArray(0, strided, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {1}), true); + auto iargs = new Nd4jLong[1]; + iargs[0] = 0; + ctx->setIArguments(iargs, 1); + delete[] iargs; + return ctx; + }; + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "stridedArgmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Strided Argmax"); + return output; + } + + static std::string fastReduceAlongDimBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + int length[] = {1024*1024, 64*1024*1024}; + int powLimit[] = {10, 20, 26}; + int powStep[] = {2, 2, 4}; + + for( int i=0; i < limit3; i++ ){ + IntPowerParameters rows("rows", 2, 0, powLimit[i], powStep[i]); + BoolParameters dim("dim"); + + + ParametersBatch batch({&rows, &dim}); + + auto generator = PARAMETRIC_XYZ() { + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + + x.push_back(arr); + y.push_back(NDArrayFactory::create_(dim)); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + z.push_back(result); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + std::string s1("Sum Along Dimension - "); + s1 += std::to_string(length[i]); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, s1.c_str()); + + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + Nd4jLong* dimArg = new Nd4jLong[1]; + dimArg[0] = dim; + ctx->setIArguments(dimArg, 1); + delete[] dimArg; + + ctx->setInputArray(0, arr, true); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + ctx->setOutputArray(0, result, true); + return ctx; + }; + + std::string s5("Argmax Along Dimension - "); 
+ s5 += std::to_string(length[i]); + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); + } + + return output; + } + + static std::string fastReduceToScalarBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, reduceScalarPowLimit, 4); //2^10 to 2^26 in steps of 4 + + ParametersBatch batch({&length}); + + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + + x.push_back(arr); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Sum - Full Array Reduction"); + + //Index reduction + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + + ctx->setInputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + ctx->setInputArray(1, NDArrayFactory::create_((Nd4jLong)0), true); + ctx->setOutputArray(0, NDArrayFactory::create_(0), true); + + return ctx; + }; + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction"); + + return output; + } + + static std::string fastNonEwsTransformBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters rowcol("rowcol", 2, 2, nonEwsPowLimit, 4); //2^2 to 2^14 in steps of 4 -> non-inplace case: 2x 2^10 x 2^10 = 128mb + BoolParameters inplace("inplace"); + + ParametersBatch batch({&rowcol, &inplace}); + + auto generator = PARAMETRIC_XZ() { + int r = p.getIntParam("rowcol"); + auto arr = NDArrayFactory::create_('c', {r, r+1}); + IndicesList indices({NDIndex::all(), NDIndex::interval(0,r-1)}); + auto view = arr->subarray(indices); + //nd4j_printf("VIEW ARRAY: rows=%lld, columns=%lld", view->sizeAt(0), view->sizeAt(1)); + x.push_back(view); + if(p.getIntParam("inplace") == 1){ + z.push_back(view); + } else { + z.push_back(NDArrayFactory::create_('c', {r,r})); + } + delete arr; + }; + + ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU_View"); + sbLRelu.setY(NDArrayFactory::create_(0.0)); + + TransformBenchmark tbExp(transform::StrictOps::Exp, "exp view"); + + output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU View"); + output += helper.runOperationSuit(&tbExp, generator, batch, "Exp View"); + + return output; + } + + static std::string fastPairwiseBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, pairwisePowLimit, 4); //2^10 to 2^26 in steps of 4 -> max is 512mb + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XYZ() { + auto arr1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + auto arr2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + x.push_back(arr1); + y.push_back(arr2); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr1); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + PairwiseBenchmark pb1(pairwise::Ops::Add, "Add"); + output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add"); + + PairwiseBenchmark pb2(pairwise::Ops::Add, "Multiply"); + output += helper.runOperationSuit(&pb2, generator, 
batch, "Pairwise Multiply"); + + return output; + } + + static std::string heavyTransformsBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22, steps of 4 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if (p.getIntParam("inplace") == 1) { + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + //Ops to test: erf (transform), betainc (custom), polygamma, synthetic ops? + TransformBenchmark erf(transform::StrictOps::Erf, "Erf"); + output += helper.runOperationSuit(&erf, generator, batch, "Error Function (Erf)"); + + ParametersBatch batch2({&length}); + nd4j::ops::polygamma op1; + DeclarableBenchmark pg(op1, "polygamma"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + auto in0 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in0->assign(0.25); + auto in1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in1->assign(0.5); + ctx->setInputArray(0, in0, true); + ctx->setInputArray(1, in1, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + return ctx; + }; + + + IntPowerParameters lengthBetaInc("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22 in steps of 4 + ParametersBatch batch3({&lengthBetaInc}); + nd4j::ops::betainc op2; + DeclarableBenchmark binc(op2, "betainc"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + auto in0 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in0->assign(0.25); + auto in1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in1->assign(0.5); + auto in2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in2->assign(0.75); + ctx->setInputArray(0, in0, true); + ctx->setInputArray(1, in1, true); + ctx->setInputArray(2, in2, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + return ctx; + }; + + output += helper.runOperationSuit(&pg, generator2, batch2, "PolyGamma Function"); + output += helper.runOperationSuit(&binc, generator3, batch3, "Incomplete Beta Function (BetaInc)"); + + return output; + } + + static std::string intermediateTransformsBenchmark() { + std::string output; + + //Non-inplace: 2x 2^26 elements FP32 -> 512MB + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, intermediateTransformPowLimit, 4); //2^20 to 2^22 in steps of 4 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + TransformBenchmark tbTanh(transform::StrictOps::Tanh, "tanh"); + TransformBenchmark tbGelu(transform::StrictOps::GELU, "gelu"); + + output += helper.runOperationSuit(&tbTanh, generator, batch, "Tanh"); + output += helper.runOperationSuit(&tbGelu, generator, batch, "gelu"); + + + //2x 1024 cols x 2^18 = 2GB + IntPowerParameters rows("rows", 2, 10, intermediateTransformPowLimit2, 4); + PredefinedParameters cols("cols", {4, 128, 1024}); + + ParametersBatch batch2({&rows, 
&cols, &inplace});
+
+        auto generator2 = PARAMETRIC_XZ() {
+            auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")});
+            arr->assign(1.0);
+            x.push_back(arr);
+            if(p.getIntParam("inplace") == 1){
+                z.push_back(arr);
+            } else {
+                z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("rows"), p.getIntParam("cols")}));
+            }
+        };
+
+        TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");
+
+        output += helper.runOperationSuit(&tbSoftmax, generator2, batch2, "Softmax");
+
+        return output;
+    }
+
+    static std::string fastTransformsBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+        IntPowerParameters length("length", 2, 10, transformBenchmarkPowLimit, 4); //2^10 up to 2^transformBenchmarkPowLimit, in steps of 4: 2^10, 2^14, ...
+        BoolParameters inplace("inplace");
+
+        ParametersBatch batch({&length, &inplace});
+
+        auto generator = PARAMETRIC_XZ() {
+            auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
+            arr->assign(1.0);
+            x.push_back(arr);
+            if(p.getIntParam("inplace") == 1){
+                z.push_back(arr);
+            } else {
+                z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
+            }
+        };
+
+        ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU");
+        sbLRelu.setY(NDArrayFactory::create_<float>(0.0));
+
+        TransformBenchmark tbAbs(transform::SameOps::Abs, "abs");
+        TransformBenchmark tbExp(transform::StrictOps::Exp, "exp");
+
+        output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU");
+        output += helper.runOperationSuit(&tbAbs, generator, batch, "Abs");
+        output += helper.runOperationSuit(&tbExp, generator, batch, "Exp");
+
+        return output;
+    }
+
+    static std::string fastScalarBenchmark() {
+        std::string output;
+        BenchmarkHelper helper(wIterations, rIterations);
+
+        IntPowerParameters length("length", 2, 10, scalarBenchmarkPowLimit, 4); //2^10 up to 2^scalarBenchmarkPowLimit, in steps of 4: 2^10, 2^14, ...
+        BoolParameters inplace("inplace");
+
+        ParametersBatch batch({&length, &inplace});
+
+        auto generator = PARAMETRIC_XZ() {
+            auto arr = NDArrayFactory::create_<float>('c', {p.getIntParam("length")});
+            arr->assign(1.0);
+            x.push_back(arr);
+            if(p.getIntParam("inplace") == 1){
+                z.push_back(arr);
+            } else {
+                z.push_back(NDArrayFactory::create_<float>('c', {p.getIntParam("length")}));
+            }
+        };
+
+        ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
+        ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
+        ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");
+
+
+        sbAdd.setY(NDArrayFactory::create_<float>(3.14159265359));
+        sbDiv.setY(NDArrayFactory::create_<float>(3.14159265359));
+        sbPow.setY(NDArrayFactory::create_<float>(3.14159265359));
+
+
+        output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359) - F32");
+        output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359) - F32");
+        output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359) - F32");
+
+        return output;
+    }
+
+
+    static long nowMs(){
+        auto s = std::chrono::system_clock::now().time_since_epoch();
+        auto v = std::chrono::duration_cast<std::chrono::milliseconds>(s).count();
+        return v;
+    }
+
+    static long duration(long start){
+        return nowMs() - start;
+    }
+
+    static long done(long start){
+        long dur = duration(start);
+        nd4j_printf("Done: %ld ms\n", dur);
+        return nowMs();
+    }
+
+
+    std::string FullBenchmarkSuit::runSuit() {
+        std::string result;
+
+        long start = nowMs();
+
+        // set 1
+        nd4j_printf("Running FullBenchmarkSuite.fastScalarBenchmark\n", "");
+        result += fastScalarBenchmark();
+        start = 
done(start); + nd4j_printf("Running FullBenchmarkSuite.fastTransformsBenchmark\n", ""); + result += fastTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.intermediateTransformsBenchmark\n", ""); + result += intermediateTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastPairwiseBenchmark\n", ""); + result += fastPairwiseBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.heavyTransformsBenchmark\n", ""); + result += heavyTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastNonEwsTransformBenchmark\n", ""); + result += fastNonEwsTransformBenchmark(); + start = done(start); + + // set 2 + nd4j_printf("Running FullBenchmarkSuite.fastReduceToScalarBenchmark\n", ""); + result += fastReduceToScalarBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastReduceAlongDimBenchmark\n", ""); + result += fastReduceAlongDimBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionsRegular\n", ""); + result += fastStridedReductionsRegular(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionIrregular\n", ""); + result += fastStridedReductionIrregular(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionNonEws\n", ""); + result += fastStridedReductionNonEws(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcastBenchmark\n", ""); + result += broadcastBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcast2dBenchmark\n", ""); + result += broadcast2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcastOpsMatrixBenchmark\n", ""); + result += broadcastOpsMatrixBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.mismatchedOrdersAssignBenchmark\n", ""); + result += mismatchedOrdersAssignBenchmark(); + start = done(start); + + + // set 3 + nd4j_printf("Running FullBenchmarkSuite.gatherOpBenchmark\n", ""); + result += gatherOpBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.scatterOpBenchmark\n", ""); + result += scatterOpBenchmark(); + start = done(start); + + // set 4 + nd4j_printf("Running FullBenchmarkSuite.gemmRegularBenchmark\n", ""); + result += gemmRegularBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.gemmIrregularBenchmark\n", ""); + result += gemmIrregularBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.rngBenchmark\n", ""); + result += rngBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.conv2dBenchmark\n", ""); + result += conv2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.pool2dBenchmark\n", ""); + result += pool2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.batchnormBenchmark\n", ""); + result += batchnormBenchmark(); + start = done(start); + + nd4j_printf("Running FullBenchmarkSuite.lstmBenchmark\n", ""); + result += lstmBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.conv3dBenchmark\n", ""); + result += conv3dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.maxPool3DBenchmark\n", ""); + result += maxPool3DBenchmark(); + start = done(start); +// nd4j_printf("Running FullBenchmarkSuite.layerNormBenchmark\n", ""); +// result += layerNormBenchmark(); +// start = done(start); + + 
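+        //The concatenated per-suite reports are returned as a single string, which NativeOps::runFullBenchmarkSuit hands back to callers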
+        return result;
+    }
+
+
+}
\ No newline at end of file
diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp
new file mode 100644
index 000000000..ae9db9b6c
--- /dev/null
+++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp
@@ -0,0 +1,639 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#include 
+#include "performance/benchmarking/LightBenchmarkSuit.h"
+
+#ifdef _RELEASE
+#define WARMUP 3
+#define NUM_ITER 10
+
+#else
+
+#define WARMUP 0
+#define NUM_ITER 1
+
+#endif
+
+namespace nd4j {
+
+    template <typename T>
+    static std::string transformBenchmark() {
+        std::string output;
+        output += "transformBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
+
+        BenchmarkHelper helper(WARMUP, NUM_ITER);
+        IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 - 4MB
+        BoolParameters inplace("inplace");
+
+        ParametersBatch batch({&length, &inplace});
+
+        auto generator = PARAMETRIC_XZ() {
+            auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
+            arr->assign(1.0);
+            x.push_back(arr);
+            if(p.getIntParam("inplace") == 1){
+                z.push_back(arr);
+            } else {
+                z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
+            }
+        };
+
+        ScalarBenchmark sbRelu(scalar::Ops::RELU, "RELU");
+        sbRelu.setY(NDArrayFactory::create_<T>(0.0));
+
+        TransformBenchmark tbSigmoid(transform::StrictOps::Sigmoid, "sigmoid");
+        TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");
+
+        output += helper.runOperationSuit(&sbRelu, generator, batch, "RELU");
+        output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Sigmoid");
+        output += helper.runOperationSuit(&tbSoftmax, generator, batch, "Softmax");
+
+        return output;
+    }
+
+    template <typename T>
+    static std::string scalarBenchmark() {
+        std::string output;
+        output += "scalarBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
+
+        BenchmarkHelper helper(WARMUP, NUM_ITER);
+
+        IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20
+        BoolParameters inplace("inplace");
+
+        ParametersBatch batch({&length, &inplace});
+
+        auto generator = PARAMETRIC_XZ() {
+            auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
+            arr->assign(1.0);
+            x.push_back(arr);
+            if(p.getIntParam("inplace") == 1){
+                z.push_back(arr);
+            } else {
+                z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
+            }
+        };
+
+        ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
+        ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
+        ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");
+
+
+        sbAdd.setY(NDArrayFactory::create_<T>(3.14159265359));
+        sbDiv.setY(NDArrayFactory::create_<T>(3.14159265359));
+        sbPow.setY(NDArrayFactory::create_<T>(3.14159265359));
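+        //setY fixes the scalar operand once per benchmark object; across runs only the array length, the inplace flag and the template type T vary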
+ + + output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359)"); + output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359)"); + output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359)"); + + return output; + } + + + template + static std::string pairwiseBenchmark() { + std::string output; + output += "pairwiseBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + IntPowerParameters length("length", 2, 8, 20, 4); //2^4 to 2^20 in steps of 4 - 2^4, 2^8, 2^16, 2^20 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XYZ() { + auto arr1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + auto arr2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + x.push_back(arr1); + y.push_back(arr2); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr1); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + PairwiseBenchmark pb1(pairwise::Ops::Add, "Add"); + output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add"); + + PairwiseBenchmark pb2(pairwise::Ops::Divide, "Divide"); + output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Divide"); + + return output; + } + + static std::string mismatchedOrderAssign() { + std::string output; + BenchmarkHelper helper(WARMUP, NUM_ITER); + + IntPowerParameters rows("rows", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 + BoolParameters cf("cf"); + + ParametersBatch batch({&rows, &cf}); + + auto generator = PARAMETRIC_XZ() { + int numElements = 4194304; //2^24 + int rows = p.getIntParam("rows"); + int cols = numElements / rows; + bool c = p.getIntParam("cf"); + + auto arr = NDArrayFactory::create_(c ? 'c' : 'f', {rows, cols}); + auto arr2 = NDArrayFactory::create_(c ? 'f' : 'c', {rows, cols}); + x.push_back(arr); + z.push_back(arr2); + }; + + TransformBenchmark tb(transform::AnyOps::Assign, "assign"); + output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign F32"); + + //Also test: NCHW to NHWC and back + BoolParameters nchw("nchw"); + int mb = 8; + int hw = 64; + int c = 3; + ParametersBatch batch2({&nchw}); + auto generator2 = PARAMETRIC_XZ() { + bool nchw = p.getIntParam("nchw"); + + if(nchw) { + auto orig = NDArrayFactory::create_('c', {mb, c, hw, hw}); + orig->permutei({0,2,3,1}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {mb, hw, hw, c})); + } else { + auto orig = NDArrayFactory::create_('c', {mb, hw, hw, c}); + orig->permutei({0,3,1,2}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {mb, c, hw, hw})); + } + }; + + TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw"); + output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign FP32"); + return output; + } + + template + static std::string gemmBenchmark() { + std::string output; + output += "gemm " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + for (int o = 0; o <= 1; o++) { + char resultOrder = (o == 0 ? 
'f' : 'c'); + IntPowerParameters sz("sz", 2, 4, 10, 2); //2^4=16, ..., 2^10=1024 -> 4 elements + + ParametersBatch b({&sz}); + + auto generator = PARAMETRIC_XYZ() { + auto a = p.getIntParam("sz"); + auto b = p.getIntParam("sz"); + auto c = p.getIntParam("sz"); + std::vector shapeA; + std::vector shapeB; + shapeA = {a, b}; + shapeB = {b, c}; + auto A = NDArrayFactory::create_('c', shapeA); + auto B = NDArrayFactory::create_('c', shapeB); + auto C = NDArrayFactory::create_(resultOrder, {a, c}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n; + n += "Gemm - cOrder="; + n += resultOrder; + + MatrixBenchmark mb(1.0, 0.0, false, false, n); + + output += helper.runOperationSuit(&mb, generator, b, n.c_str()); + } + + return output; + } + + template + static std::string reduceFullBenchmark() { + std::string output; + output += "reduceFullBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + + IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 + + ParametersBatch batch({&length}); + + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + + x.push_back(arr); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbProd(reduce::SameOps::Prod, "prod"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Sum - Full Array Reduction"); + output += helper.runOperationSuit(&rbProd, (const std::function)(generator), batch, "Product - Full Array Reduction"); + output += helper.runOperationSuit(&rbMax, (const std::function)(generator), batch, "Maximum - Full Array Reduction"); + + //Index reduction + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + + ctx->setInputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + ctx->setInputArray(1, NDArrayFactory::create_((Nd4jLong)0), true); + ctx->setOutputArray(0, NDArrayFactory::create_(0), true); + + return ctx; + }; + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction"); + return output; + } + + template + static std::string reduceDimBenchmark(){ + std::string output; + output += "reduceDimBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + + int length[] = {1024*1024}; + int pow[] = {10}; + + for( int i=0; i<1; i++ ){ + IntPowerParameters rows("rows", 2, 0, pow[i], 2); + BoolParameters dim("dim"); + + + ParametersBatch batch({&rows, &dim}); + + auto generator = PARAMETRIC_XYZ() { + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + + x.push_back(arr); + y.push_back(NDArrayFactory::create_(dim)); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + z.push_back(result); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + std::string s1("Sum Along Dimension - "); + s1 += std::to_string(length[i]); + std::string s3("Maximum Along Dimension - "); + s3 += std::to_string(length[i]); + + output += helper.runOperationSuit(&rbSum, 
(const std::function)(generator), batch, s1.c_str()); + output += helper.runOperationSuit(&rbMax, (const std::function)(generator), batch, s3.c_str()); + + + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + auto dimArg = new Nd4jLong[1]; + dimArg[0] = dim; + ctx->setIArguments(dimArg, 1); + delete[] dimArg; + + ctx->setInputArray(0, arr, true); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + ctx->setOutputArray(0, result, true); + return ctx; + }; + + std::string s5("Argmax Along Dimension - "); + s5 += std::to_string(length[i]); + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); + } + return output; + } + + template + static std::string conv2d(){ + std::string output; + output += "conv2d " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + //Convolution2D op + BoolParameters nhwc("nhwc"); + PredefinedParameters k("k", {2, 3}); + + ParametersBatch batch({&nhwc, &k}); + nd4j::ops::conv2d conv2d; + DeclarableBenchmark benchmark(conv2d, "conv2d"); + + int hw = 64; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {8, 3, hw, hw}); + auto output = NDArrayFactory::create_('c', {8, 3, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {8, hw, hw, 3}); + auto output = NDArrayFactory::create_('c', {8, hw, hw, 3}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto b = NDArrayFactory::create_('c', {3}); + auto w = NDArrayFactory::create_('c', {khw, khw, 3, 3}); // [kH, kW, iC, oC] always + + ctx->setInputArray(1, w, true); + ctx->setInputArray(2, b, true); + + auto args = new Nd4jLong[10]; + args[0] = args[1] = khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 10); + delete[] args; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d"); + return output; + } + + template + static std::string pool2d() { + std::string output; + output += "pool2d " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + //Convolution2D op + BoolParameters nhwc("nhwc"); + PredefinedParameters k("k", {2, 3}); + + ParametersBatch batch({&nhwc, &k}); + + int c = 3; + int hw = 64; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {8, c, hw, hw}); + auto output = NDArrayFactory::create_('c', {8, c, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {8, hw, hw, c}); + auto output = NDArrayFactory::create_('c', {8, hw, hw, c}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto args = new Nd4jLong[11]; + args[0] = args[1] = 
khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = 0; //Divisor mode - 0 = exclude padding in divisor + args[10] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 11); + delete[] args; + + return ctx; + }; + + nd4j::ops::avgpool2d avgpool2d; + DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); + output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pool 2d"); + + nd4j::ops::maxpool2d maxpool2d; + DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); + output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pool 2d"); + return output; + } + + template + static std::string lstmBenchmark() { + std::string output; + output += "lstm " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen] + PredefinedParameters mb("mb", {1, 8}); + int n = 128; + + ParametersBatch batch({&format, &mb}); + nd4j::ops::lstmBlock lstmBlock; + DeclarableBenchmark benchmark(lstmBlock, "lstm"); + + int seqLength = 8; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int f = p.getIntParam("format"); + int m = p.getIntParam("mb"); + + Nd4jLong l = 0; + ctx->setInputArray(0, NDArrayFactory::create_(l), true); //Max TS length (unused) + + + if (f == 0) { + //TNS format + ctx->setInputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('c', {seqLength, m, n}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('c', {seqLength, m, n}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('c', {seqLength, m, n}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('c', {seqLength, m, n}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('c', {seqLength, m, n}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('c', {seqLength, m, n}), true); //y + } else { + //NST format + ctx->setInputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('f', {m, n, seqLength}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('f', {m, n, seqLength}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('f', {m, n, seqLength}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('f', {m, n, seqLength}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('f', {m, n, seqLength}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('f', {m, n, seqLength}), true); //y + } + + auto cLast = NDArrayFactory::create_('c', {m, n}); + auto yLast = NDArrayFactory::create_('c', {m, n}); + auto W = NDArrayFactory::create_('c', {2 * n, 4 * n}); + auto Wci = NDArrayFactory::create_('c', {n}); + auto Wcf = NDArrayFactory::create_('c', {n}); + auto Wco = NDArrayFactory::create_('c', {n}); + auto b = NDArrayFactory::create_('c', {4 * n}); + + ctx->setInputArray(2, cLast, true); + ctx->setInputArray(3, yLast, true); + ctx->setInputArray(4, W, true); + ctx->setInputArray(5, Wci, true); + ctx->setInputArray(6, Wcf, true); + ctx->setInputArray(7, Wco, true); + ctx->setInputArray(8, b, true); + + auto iargs = new Nd4jLong[2]; + iargs[0] = 0; //No peephole + iargs[1] = f; + ctx->setIArguments(iargs, 2); + delete[] 
iargs; + + auto targs = new double[2]; + targs[0] = 1.0; //forget bias + targs[1] = 0.0; //cell clipping value + ctx->setTArguments(targs, 2); + delete[] targs; + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock"); + return output; + } + + static std::string broadcast2d() { + std::string output; + BenchmarkHelper helper(WARMUP, NUM_ITER); + + int rows = 65536; + IntPowerParameters cols("cols", 2, 2, 12, 4); //powers of 2 from 2^2 to 2^12, exponent step 4: 2^2=4, 2^6=64, 2^10=1024 + BoolParameters axis("axis"); + BoolParameters inplace("inplace"); + + ParametersBatch batch({&cols, &axis, &inplace}); + + auto generator = PARAMETRIC_D() { + auto a = p.getIntParam("axis"); + auto arr = NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")}); + + auto ctx = new Context(1); + ctx->setInputArray(0, arr, true); + if (a == 0) { + ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {rows, 1}), true); + } else { + ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true); + } + if (p.getIntParam("inplace") == 1) { + ctx->setOutputArray(0, arr); + ctx->markInplace(true); + } else { + ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")}), true); + } + return ctx; + }; + + nd4j::ops::add op; + DeclarableBenchmark benchmark(op, "add"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d"); + return output; + } + + std::string LightBenchmarkSuit::runSuit() { +#ifdef _RELEASE + std::vector<nd4j::DataType> dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); +#else + std::vector<nd4j::DataType> dtypes({nd4j::DataType::FLOAT32}); +#endif + + std::string result; + + for (auto t : dtypes) { + nd4j_printf("Running LightBenchmarkSuite.transformBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += transformBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.scalarBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += scalarBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.reduceDimBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += reduceDimBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.gemmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += gemmBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.conv2d [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += conv2d, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.pool2d [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += pool2d, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES); + } + + nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", ""); + result += broadcast2d(); + nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", ""); + result += mismatchedOrderAssign(); + + return result; + }
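Every entry in LightBenchmarkSuit above follows the same pattern: declare the parameter space (ParametersBatch), supply a PARAMETRIC_D generator that builds one Context per parameter combination, wrap the op in a DeclarableBenchmark, and hand all three to BenchmarkHelper::runOperationSuit. A minimal sketch of what a new entry would look like, built only from the helpers shown in this file; the float element type, the sizes, and the "Elementwise Add - 1d" label are illustrative, not part of this patch:

    static std::string elementwiseAddBenchmark() {
        std::string output;
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        // Benchmark lengths 2^10, 2^15 and 2^20 (exponent stepped by 5)
        IntPowerParameters length("length", 2, 10, 20, 5);
        ParametersBatch batch({&length});

        auto generator = PARAMETRIC_D() {
            auto len = p.getIntParam("length");
            auto ctx = new Context(1);
            // 'true' hands ownership of the temporary arrays to the Context
            ctx->setInputArray(0, NDArrayFactory::create_<float>('c', {len}), true);
            ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {len}), true);
            ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {len}), true);
            return ctx;
        };

        nd4j::ops::add op;
        DeclarableBenchmark benchmark(op, "add");
        output += helper.runOperationSuit(&benchmark, generator, batch, "Elementwise Add - 1d");
        return output;
    }

The ownership flag appears to be why none of the generators above free their arrays explicitly: the Context is assumed to release anything passed with true.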
+} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 08108d69c..95601ce4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -311,3 +311,27 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_1) { delete result; } + +TEST_F(DeclarableOpsTests15, test_lstmBlock_2) { + int seqLength = 32; + int m = 64; + int n = 32; + + auto x0 = NDArrayFactory::create(5); + auto x1 = NDArrayFactory::create<float>('f', {m, n, seqLength}); + auto x2 = NDArrayFactory::create<float>('f', {m, n}); + auto x3 = NDArrayFactory::create<float>('f', {m, n}); + auto x4 = NDArrayFactory::create<float>('f', {2 * n, 4 * n}); + auto x5 = NDArrayFactory::create<float>('f', {n}); + auto x6 = NDArrayFactory::create<float>('f', {n}); + auto x7 = NDArrayFactory::create<float>('f', {n}); + auto x8 = NDArrayFactory::create<float>('f', {4 * n}); + + nd4j::ops::lstmBlock op; + auto result = op.execute({&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8}, {1.0, 0.0}, {0, 1}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + delete result; +} diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 67a2585e0..c94758c5a 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -38,6 +38,8 @@ #include #include #include +#include <performance/benchmarking/LightBenchmarkSuit.h> +#include <performance/benchmarking/FullBenchmarkSuit.h> using namespace nd4j; using namespace nd4j::graph; @@ -164,6 +166,12 @@ TEST_F(PlaygroundTests, BroadcastOps2d) { } */ +TEST_F(PlaygroundTests, test_benchmark_suit_1) { + //LightBenchmarkSuit suit; + //auto output = suit.runSuit(); + //nd4j_printf("SUIT OUTPUT\n%s\n", output.data()); +} + TEST_F(PlaygroundTests, test_small_reductions) { auto f = NDArrayFactory::create<float>('c', {1024, 1024}); f.assign(1.0f); diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 38e3b9523..1ac373676 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -193,6 +193,7 @@ if ("${OPENBLAS}" OR CMAKE_BUILD_TYPE STREQUAL "Release") endif() endif() +file(GLOB_RECURSE PERF_SOURCES false ../../include/performance/*.cpp ../../include/performance/*.h) file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../../include/exceptions/*.cpp ../../include/exceptions/*.h) file(GLOB_RECURSE EXEC_SOURCES false ../../include/execution/*.cpp ../../include/execution/*.h) file(GLOB_RECURSE TYPES_SOURCES false ../../include/types/*.cpp ../../include/types/*.h) @@ -234,7 +235,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} - ${OPS_SOURCES} ${TEST_SOURCES}) + ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES}) diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java index 2f2f2478c..330fb1c31 100644 ---
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java @@ -917,4 +917,14 @@ public class DefaultOpExecutioner implements OpExecutioner { public DataBuffer createConstantBuffer(double[] values, DataType desiredType) { throw new UnsupportedOperationException(); } + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + throw new UnsupportedOperationException(); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + throw new UnsupportedOperationException(); + } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java index c4b39d653..1be417644 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java @@ -463,4 +463,8 @@ public interface OpExecutioner { DataBuffer createConstantBuffer(int[] values, DataType desiredType); DataBuffer createConstantBuffer(float[] values, DataType desiredType); DataBuffer createConstantBuffer(double[] values, DataType desiredType); + + + String runLightBenchmarkSuit(boolean printOut); + String runFullBenchmarkSuit(boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java index 1ac932584..c6413d411 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java @@ -1386,15 +1386,39 @@ public class Nd4j { */ public static DataBuffer createBufferDetached(int[] shape, DataType type) { long length = ArrayUtil.prodLong(shape); - if (type == DataType.INT) - return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); - if (type == DataType.LONG) - return DATA_BUFFER_FACTORY_INSTANCE.createLong(new long[]{length}); - else if (type == DataType.HALF) - return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); - - return type == DataType.DOUBLE ? 
DATA_BUFFER_FACTORY_INSTANCE.createDouble(length) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); - + switch (type){ + case DOUBLE: + return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); + case FLOAT: + return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); + case HALF: + return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); + case BFLOAT16: + return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length); + case UINT64: + return DATA_BUFFER_FACTORY_INSTANCE.createULong(length); + case LONG: + return DATA_BUFFER_FACTORY_INSTANCE.createLong(length); + case UINT32: + return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length); + case INT: + return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); + case UINT16: + return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length); + case SHORT: + return DATA_BUFFER_FACTORY_INSTANCE.createShort(length); + case UBYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length); + case BYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createByte(length); + case BOOL: + return DATA_BUFFER_FACTORY_INSTANCE.createBool(length); + case UTF8: + case COMPRESSED: + case UNKNOWN: + default: + throw new UnsupportedOperationException("Cannot create type: " + type); + } } /** @@ -1403,16 +1427,39 @@ public class Nd4j { public static DataBuffer createBuffer(long[] shape, DataType type) { long length = ArrayUtil.prodLong(shape); - if (type == DataType.INT) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.LONG) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.HALF) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.DOUBLE) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + switch (type) { + case BOOL: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UBYTE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT16: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT32: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? 
DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT64: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case BYTE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case SHORT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case INT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case LONG: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case HALF: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case BFLOAT16: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case FLOAT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case DOUBLE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? 
DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UTF8: + case COMPRESSED: + case UNKNOWN: + default: + throw new UnsupportedOperationException("Cannot create type: " + type); + } } @@ -1424,19 +1471,31 @@ public class Nd4j { switch (type){ case DOUBLE: - DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); + return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); case FLOAT: - DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); + return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); case HALF: return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); + case BFLOAT16: + return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length); + case UINT64: + return DATA_BUFFER_FACTORY_INSTANCE.createULong(length); case LONG: return DATA_BUFFER_FACTORY_INSTANCE.createLong(length); + case UINT32: + return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length); case INT: return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); + case UINT16: + return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length); case SHORT: + return DATA_BUFFER_FACTORY_INSTANCE.createShort(length); case UBYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length); case BYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createByte(length); case BOOL: + return DATA_BUFFER_FACTORY_INSTANCE.createBool(length); case UTF8: case COMPRESSED: case UNKNOWN: diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index 278a4e39f..e5990f981 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1161,4 +1161,10 @@ public abstract class NativeOps extends Pointer { public abstract Pointer constantBuffer(int dtype, DoublePointer data, int length); public abstract Pointer constantBuffer(int dtype, @Cast("Nd4jLong *") LongPointer data, int length); + + public abstract String runLightBenchmarkSuit(boolean printOut); + + public abstract String runFullBenchmarkSuit(boolean printOut); + + public abstract long getCachedMemory(int deviceId); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java index dcb644468..4b4bff588 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java @@ -418,6 +418,126 @@ public class CudaDataBufferFactory implements DataBufferFactory { return new CudaIntDataBuffer(length); } + @Override + public DataBuffer createBFloat16(long length) { + return new CudaBfloat16DataBuffer(length); + } + + @Override + public DataBuffer createUInt(long length) { + return new CudaUInt32DataBuffer(length); + } + + @Override + public DataBuffer createUShort(long length) { + return new CudaUInt16DataBuffer(length); + } + + @Override + public DataBuffer createUByte(long length) { + return new CudaUByteDataBuffer(length); + } + + @Override + public DataBuffer createULong(long length) { + return new 
CudaUInt64DataBuffer(length); + } + + @Override + public DataBuffer createBool(long length) { + return new CudaBoolDataBuffer(length); + } + + @Override + public DataBuffer createShort(long length) { + return new CudaShortDataBuffer(length); + } + + @Override + public DataBuffer createByte(long length) { + return new CudaByteDataBuffer(length); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize) { + return new CudaBfloat16DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize) { + return new CudaUInt32DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize) { + return new CudaUInt16DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize) { + return new CudaUByteDataBuffer(length, initialize); + } + + @Override + public DataBuffer createULong(long length, boolean initialize) { + return new CudaUInt64DataBuffer(length, initialize); + } + + @Override + public DataBuffer createBool(long length, boolean initialize) { + return new CudaBoolDataBuffer(length, initialize); + } + + @Override + public DataBuffer createShort(long length, boolean initialize) { + return new CudaShortDataBuffer(length, initialize); + } + + @Override + public DataBuffer createByte(long length, boolean initialize) { + return new CudaByteDataBuffer(length, initialize); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaBfloat16DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt32DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt16DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUByteDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt64DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaBoolDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaShortDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaByteDataBuffer(length, initialize, workspace); + } + @Override public DataBuffer createInt(long length, boolean initialize) { return new CudaIntDataBuffer(length, initialize); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java index bcfa7b22d..3145fd8c2 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java +++ 
b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java @@ -2757,6 +2757,16 @@ public class CudaExecutioner extends DefaultOpExecutioner { return buffer; } + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + return nativeOps.runLightBenchmarkSuit(printOut); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + return nativeOps.runFullBenchmarkSuit(printOut); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index 3f7794074..4f11acb7c 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { */ public native int getDeviceMajor(int deviceId); + /** + * This method returns amount of cached memory + * @param deviceId + * @return + */ + public native @Cast("Nd4jLong") long getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId); + public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer); public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer); @@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); + + + public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut); + public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java index dd44914be..238209e88 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java @@ -2185,4 +2185,15 @@ public class NativeOpExecutioner extends DefaultOpExecutioner { sb.append(". 
Output var names: ").append(Arrays.toString(outNames)); } } + + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + return loop.runLightBenchmarkSuit(printOut); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + return loop.runFullBenchmarkSuit(printOut); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index fcde146d3..5cc1a46a0 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { */ public native int getDeviceMajor(int deviceId); + /** + * This method returns amount of cached memory + * @param deviceId + * @return + */ + public native @Cast("Nd4jLong") long getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId); + public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer); public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer); @@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); + + + public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut); + public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java index 5688aa611..1915722b4 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java @@ -3306,6 +3306,28 @@ public class Nd4jTestsC extends BaseNd4jTest { log.info("arrayf data: {}", Arrays.toString(arrayf.data().asFloat())); } + @Test + public void testCreateDetached_1() { + val shape = new int[]{10}; + val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF}; + + for(DataType dt : dataTypes){ + val dataBuffer = Nd4j.createBufferDetached(shape, dt); + assertEquals(dt, dataBuffer.dataType()); + } + } + + @Test + public void testCreateDetached_2() { + val shape = new long[]{10}; + val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF}; + + for(DataType dt : 
dataTypes){ + val dataBuffer = Nd4j.createBufferDetached(shape, dt); + assertEquals(dt, dataBuffer.dataType()); + } + } + @Test public void testPairwiseMixedC() { int[] shape2 = {12, 8}; @@ -7889,6 +7911,7 @@ public class Nd4jTestsC extends BaseNd4jTest { assertEquals(Nd4j.createFromArray(1f, 3f, 4f), out); } + private static INDArray fwd(INDArray input, INDArray W, INDArray b){ INDArray ret = Nd4j.createUninitialized(input.size(0), W.size(1)); input.mmuli(W, ret); diff --git a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java index 1a2ec6f37..743f34655 100644 --- a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java +++ b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java @@ -355,6 +355,7 @@ public interface DataBufferFactory { DataBuffer create(DataType dataType, long length, boolean initialize, MemoryWorkspace workspace); + /** * Create an int data buffer * @@ -363,6 +364,33 @@ public interface DataBufferFactory { */ DataBuffer createInt(long length); + DataBuffer createBFloat16(long length); + DataBuffer createByte(long length); + DataBuffer createShort(long length); + DataBuffer createBool(long length); + DataBuffer createUShort(long length); + DataBuffer createUInt(long length); + DataBuffer createUByte(long length); + DataBuffer createULong(long length); + + DataBuffer createBFloat16(long length, boolean initialize); + DataBuffer createByte(long length, boolean initialize); + DataBuffer createShort(long length, boolean initialize); + DataBuffer createBool(long length, boolean initialize); + DataBuffer createUShort(long length, boolean initialize); + DataBuffer createUInt(long length, boolean initialize); + DataBuffer createUByte(long length, boolean initialize); + DataBuffer createULong(long length, boolean initialize); + + DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace); + /** * Create an int data buffer, with optional initialization * diff --git a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java index 2bb49716e..96b154338 100644 --- a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java +++ b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java @@ -354,11 +354,132 @@ public class DefaultDataBufferFactory implements DataBufferFactory { return new IntBuffer(length); } + @Override + public DataBuffer createBFloat16(long length) { + return new BFloat16Buffer(length); + } + + @Override + public DataBuffer createUInt(long length) { + return new UInt32Buffer(length); + } + + @Override + public DataBuffer createUShort(long length) { + 
return new UInt16Buffer(length); + } + + @Override + public DataBuffer createUByte(long length) { + return new UInt8Buffer(length); + } + + @Override + public DataBuffer createULong(long length) { + return new UInt64Buffer(length); + } + + @Override + public DataBuffer createBool(long length) { + return new BoolBuffer(length); + } + + @Override + public DataBuffer createShort(long length) { + return new Int16Buffer(length); + } + + @Override + public DataBuffer createByte(long length) { + return new Int8Buffer(length); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize) { + return new BFloat16Buffer(length, initialize); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize) { + return new UInt32Buffer(length, initialize); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize) { + return new UInt16Buffer(length, initialize); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize) { + return new UInt8Buffer(length, initialize); + } + + @Override + public DataBuffer createULong(long length, boolean initialize) { + return new UInt64Buffer(length, initialize); + } + + @Override + public DataBuffer createBool(long length, boolean initialize) { + return new BoolBuffer(length, initialize); + } + + @Override + public DataBuffer createShort(long length, boolean initialize) { + return new Int16Buffer(length, initialize); + } + + @Override + public DataBuffer createByte(long length, boolean initialize) { + return new Int8Buffer(length, initialize); + } + @Override public DataBuffer createInt(long length, boolean initialize) { return new IntBuffer(length, initialize); } + @Override + public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) { + return new BFloat16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt32Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt8Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt64Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) { + return new BoolBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new Int16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new Int8Buffer(length, initialize, workspace); + } + + @Override public DataBuffer createInt(long length, boolean initialize, MemoryWorkspace workspace) { return new IntBuffer(length, initialize, workspace); @@ -665,12 +786,12 @@ public class DefaultDataBufferFactory implements DataBufferFactory { @Override public DataBuffer createHalf(long length) { - throw new UnsupportedOperationException("FP16 isn't supported for CPU yet"); + return new HalfBuffer(length); } @Override public DataBuffer createHalf(long length, boolean initialize) { - throw 
new UnsupportedOperationException("FP16 isn't supported for CPU yet"); + return new HalfBuffer(length, initialize); } /**
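With the factory and executioner plumbing above in place, the suites are reachable from both sides. On the native side they hang off NativeOps (runLightBenchmarkSuit/runFullBenchmarkSuit, declared in NativeOps.h); a minimal sketch, where the include path and the pairing of the returned report with the new deleteCharArray are assumptions rather than something this patch states:

    #include <NativeOps.h>

    int main() {
        NativeOps ops;

        // Light suite: transform, scalar, pairwise, reduction, gemm, conv2d,
        // pool2d and lstm benchmarks over FLOAT32 (plus HALF in release builds).
        const char* report = ops.runLightBenchmarkSuit(true /*printOut*/);

        // Assumption: the report is heap-allocated by the suite, so the caller
        // releases it through deleteCharArray (added in this PR).
        ops.deleteCharArray(reinterpret_cast<Nd4jPointer>(const_cast<char*>(report)));
        return 0;
    }

On the Java side the same entry points surface as Nd4j.getExecutioner().runLightBenchmarkSuit(true) and Nd4j.getExecutioner().runFullBenchmarkSuit(true), backed by NativeOpExecutioner on CPU and CudaExecutioner on CUDA as shown above.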