diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index ad8a781a6..8537fc59c 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -247,6 +247,7 @@ if(CUDA_BLAS) endif() endif() + file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h) file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h) file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/impl/*.cpp ../include/execution/*.cu ../include/execution/*.h) file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h) @@ -267,7 +268,7 @@ if(CUDA_BLAS) ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES}) + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) else() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_TESTS=true") @@ -276,7 +277,7 @@ if(CUDA_BLAS) ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp Environment.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES}) + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) endif() @@ -300,6 +301,7 @@ elseif(CPU_BLAS) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true") endif() + file(GLOB_RECURSE PERF_SOURCES false ../include/performance/*.cpp ../include/performance/*.h) file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../include/exceptions/*.cpp ../include/exceptions/*.h) file(GLOB_RECURSE EXEC_SOURCES false ../include/execution/*.cpp ../include/execution/*.h) file(GLOB_RECURSE TYPES_SOURCES false ../include/types/*.cpp ../include/types/*.h) @@ -320,7 +322,7 @@ elseif(CPU_BLAS) ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} - ${OPS_SOURCES}) + ${OPS_SOURCES} ${PERF_SOURCES}) if(IOS) add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>) else() diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index 1c818d528..f28a76836 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -759,6 +759,13 @@ public: */ int getDeviceMajor(int deviceId); + /** + * This method returns the amount of cached memory for the given device + * @param deviceId + * @return + */ + Nd4jLong getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -1653,6 +1660,7 @@ public: int unregisterGraph(Nd4jPointer *extraPointers, Nd4jLong graphId); + void deleteCharArray(Nd4jPointer pointer); void deleteIntArray(Nd4jPointer pointer); void deleteLongArray(Nd4jPointer pointer); void deletePointerArray(Nd4jPointer pointer); @@ -1690,6 +1698,10 @@ public: nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, Nd4jLong *data, int length); nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, double *data, int length); nd4j::ConstantDataBuffer*
constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor); + + + const char* runLightBenchmarkSuit(bool printOut); + const char* runFullBenchmarkSuit(bool printOut); }; diff --git a/libnd4j/blas/cpu/NDArrayFactory.cpp b/libnd4j/blas/cpu/NDArrayFactory.cpp index 8fcd29eb7..ec99ef7db 100644 --- a/libnd4j/blas/cpu/NDArrayFactory.cpp +++ b/libnd4j/blas/cpu/NDArrayFactory.cpp @@ -204,6 +204,9 @@ template void NDArrayFactory::memcpyFromVector(void *ptr, const std::vector diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index d281bdfac..460c9d4b6 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -72,6 +72,9 @@ bool experimentalSupport = false; #include #include #include +#include <performance/benchmarking/LightBenchmarkSuit.h> +#include <performance/benchmarking/FullBenchmarkSuit.h> +#include <helpers/ConstantHelper.h> using namespace nd4j; @@ -2304,6 +2307,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) { delete[] ptr; } +void NativeOps::deleteCharArray(Nd4jPointer pointer) { + auto ptr = reinterpret_cast<char *>(pointer); + delete[] ptr; +} + void NativeOps::deleteIntArray(Nd4jPointer pointer) { auto ptr = reinterpret_cast<int *>(pointer); delete[] ptr; @@ -2792,6 +2800,38 @@ void NativeOps::sortTadByValue(Nd4jPointer *extraPointers, BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortTadByValue(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); } +const char* NativeOps::runLightBenchmarkSuit(bool printOut) { + nd4j::LightBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +Nd4jLong NativeOps::getCachedMemory(int deviceId) { + return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); +} + +const char* NativeOps::runFullBenchmarkSuit(bool printOut) { + nd4j::FullBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + BUILD_SINGLE_TEMPLATE(template void flattenGeneric,(Nd4jPointer*, int, char, void*, Nd4jLong*, void*, Nd4jLong*), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void pullRowsGeneric, (void *, Nd4jLong*, void*, Nd4jLong*, const int, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*), LIBND4J_TYPES); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 4fa3a36fa..b56d5da94 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -47,6 +47,8 @@ using namespace nd4j; #include +#include <performance/benchmarking/LightBenchmarkSuit.h> +#include <performance/benchmarking/FullBenchmarkSuit.h> cudaDeviceProp *deviceProperties; cudaFuncAttributes *funcAttributes = new cudaFuncAttributes[64]; @@ -2804,6 +2806,11 @@ void NativeOps::deletePointerArray(Nd4jPointer pointer) { delete[] ptr; } +void NativeOps::deleteCharArray(Nd4jPointer pointer) { + auto ptr = reinterpret_cast<char *>(pointer); + delete[] ptr; +} + void NativeOps::deleteIntArray(Nd4jPointer pointer) { auto ptr = reinterpret_cast<int *>(pointer); delete[] ptr; @@ -3289,3 +3296,35 @@ Nd4jPointer NativeOps::shapeBufferForNumpy(Nd4jPointer npyArray) { } return reinterpret_cast<Nd4jPointer>(nd4j::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true)); }
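The suite runners return a heap-allocated, NUL-terminated copy of the report, so the caller owns the buffer and is expected to hand it back through deleteCharArray. A minimal sketch of the intended call pattern, assuming an existing NativeOps instance (the variable names here are illustrative, not part of the patch):

    NativeOps nativeOps;
    // run the reduced suite silently and take ownership of the returned report
    auto report = nativeOps.runLightBenchmarkSuit(false);
    nd4j_printf("%s\n", report);
    // the buffer was allocated with new[], so it must be released via the same API
    nativeOps.deleteCharArray(reinterpret_cast<Nd4jPointer>(const_cast<char *>(report)));

+ +const char* NativeOps::runLightBenchmarkSuit(bool printOut) { + nd4j::LightBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n",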
result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +const char* NativeOps::runFullBenchmarkSuit(bool printOut) { + nd4j::FullBenchmarkSuit suit; + auto result = suit.runSuit(); + + if (printOut) + nd4j_printf("%s\n", result.data()); + + auto chars = new char[result.length()+1]; + std::memcpy(chars, result.data(), result.length()); + chars[result.length()] = (char) 0x0; + + return chars; +} + +Nd4jLong NativeOps::getCachedMemory(int deviceId) { + return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); +} \ No newline at end of file diff --git a/libnd4j/include/helpers/BenchmarkHelper.h b/libnd4j/include/helpers/BenchmarkHelper.h index 7acea057d..58ed7e1b7 100644 --- a/libnd4j/include/helpers/BenchmarkHelper.h +++ b/libnd4j/include/helpers/BenchmarkHelper.h @@ -50,7 +50,7 @@ namespace nd4j { unsigned int _rIterations; protected: - void benchmarkOperation(OpBenchmark &benchmark); + std::string benchmarkOperation(OpBenchmark &benchmark); void benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z); @@ -58,34 +58,30 @@ namespace nd4j { void benchmarkGEMM(char orderA, std::initializer_list shapeA, char orderB, std::initializer_list shapeB, char orderC, std::initializer_list shapeC); - void printHeader(); + std::string printHeader(); public: BenchmarkHelper(unsigned int warmUpIterations = 10, unsigned int runIterations = 100); - void runOperationSuit(std::initializer_list benchmarks, const char *msg = nullptr); - void runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg = nullptr); + std::string runOperationSuit(std::initializer_list benchmarks, const char *msg = nullptr); + std::string runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg = nullptr); + std::string runOperationSuit(OpBenchmark* benchmark); - void runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message = nullptr); + std::string runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message = nullptr); - void runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(ReductionBenchmark *op, const std::function& func, 
ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); + std::string runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - void runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); - - - void runScalarSuit(); - - void runAllSuits(); + std::string runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message = nullptr); }; } diff --git a/libnd4j/include/helpers/ConstantHelper.h b/libnd4j/include/helpers/ConstantHelper.h index 3f31439c9..a7f7d0c00 100644 --- a/libnd4j/include/helpers/ConstantHelper.h +++ b/libnd4j/include/helpers/ConstantHelper.h @@ -44,6 +44,8 @@ namespace nd4j { std::vector _devicePointers; std::vector _deviceOffsets; std::mutex _mutex; + + std::vector _counters; public: ~ConstantHelper() = default; @@ -53,6 +55,8 @@ namespace nd4j { void* replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace = nullptr); ConstantDataBuffer* constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType); + + Nd4jLong getCachedAmount(int deviceId); }; } diff --git a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h index 717623647..0aa8c35a6 100644 --- a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h +++ b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h @@ -36,7 +36,7 @@ namespace nd4j { nd4j::graph::Context *_context = nullptr; public: DeclarableBenchmark(nd4j::ops::DeclarableOp &op, std::string name = 0) : OpBenchmark() { - _op = ops::OpRegistrator::getInstance()->getOperation(op.getOpHash()); + _op = &op; //ops::OpRegistrator::getInstance()->getOperation(op.getOpHash()); _testName = name; } diff --git a/libnd4j/include/helpers/cpu/ConstantHelper.cpp b/libnd4j/include/helpers/cpu/ConstantHelper.cpp index 008e43bfe..f74bd5637 100644 --- a/libnd4j/include/helpers/cpu/ConstantHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantHelper.cpp @@ -30,9 +30,11 @@ namespace nd4j { ConstantHelper::ConstantHelper() { int numDevices = getNumberOfDevices(); _cache.resize(numDevices); + _counters.resize(numDevices); for (int e = 0; e < 
numDevices; e++) { std::map<ConstantDescriptor, ConstantHolder> map; _cache[e] = map; + _counters[e] = 0L; } } @@ -44,8 +46,14 @@ } void* ConstantHelper::replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace) { + if (workspace == nullptr) { + auto deviceId = getCurrentDevice(); + _counters[deviceId] += numBytes; + } + int8_t *ptr = nullptr; ALLOCATE(ptr, workspace, numBytes, int8_t); + std::memcpy(ptr, src, numBytes); return ptr; } @@ -71,7 +79,9 @@ if (holder->hasBuffer(dataType)) return holder->getConstantDataBuffer(dataType); else { - int8_t *cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)]; + auto size = descriptor.length() * DataTypeUtils::sizeOf(dataType); + auto cbuff = new int8_t[size]; + _counters[deviceId] += size; // create buffer with this dtype if (descriptor.isFloat()) { @@ -87,6 +97,14 @@ } } + Nd4jLong ConstantHelper::getCachedAmount(int deviceId) { + int numDevices = getNumberOfDevices(); + if (deviceId >= numDevices || deviceId < 0) + return 0L; + else + return _counters[deviceId]; + } + nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; } diff --git a/libnd4j/include/helpers/cuda/ConstantHelper.cu b/libnd4j/include/helpers/cuda/ConstantHelper.cu index cb96630a9..d0579b66d 100644 --- a/libnd4j/include/helpers/cuda/ConstantHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantHelper.cu @@ -70,6 +70,7 @@ namespace nd4j { _devicePointers.resize(numDevices); _deviceOffsets.resize(numDevices); _cache.resize(numDevices); + _counters.resize(numDevices); // filling all pointers for (int e = 0; e < numDevices; e++) { @@ -83,6 +84,7 @@ _devicePointers[e] = constant; _deviceOffsets[e] = 0; _cache[e] = devCache; + _counters[e] = 0L; } // @@ -115,6 +117,7 @@ constantPtr = _devicePointers[deviceId]; constantOffset = _deviceOffsets[deviceId]; } + if (constantOffset + numBytes >= CONSTANT_LIMIT) { int8_t *ptr = nullptr; ALLOCATE_SPECIAL(ptr, workspace, numBytes, int8_t); @@ -154,7 +157,9 @@ if (holder->hasBuffer(dataType)) { return holder->getConstantDataBuffer(dataType); } else { - auto cbuff = new int8_t[descriptor.length() * DataTypeUtils::sizeOf(dataType)]; + auto numBytes = descriptor.length() * DataTypeUtils::sizeOf(dataType); + auto cbuff = new int8_t[numBytes]; + _counters[deviceId] += numBytes; // create buffer with this dtype if (descriptor.isFloat()) { @@ -172,5 +177,13 @@ } } + Nd4jLong ConstantHelper::getCachedAmount(int deviceId) { + int numDevices = getNumberOfDevices(); + if (deviceId >= numDevices || deviceId < 0) + return 0L; + else + return _counters[deviceId]; + } + nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp index e92c7220f..cbe0c0729 100644 --- a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp +++ b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp @@ -30,11 +30,11 @@ namespace nd4j { _rIterations = runIterations; } - void BenchmarkHelper::printHeader() { - nd4j_printf("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n",""); + std::string BenchmarkHelper::printHeader() { + return std::string("TestName\tOpNum\tWarmup\tNumIter\tDataType\tInplace\tShape\tStrides\tAxis\tOrders\tavg (us)\tmedian (us)\tmin (us)\tmax (us)\tstdev (us)\n"); }
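Since each helper now returns its rows as std::string instead of printing them, a suite can accumulate the whole report in memory and print it (or ship it across JNI) once at the end. A rough sketch of the pattern the suites rely on; the benchmark instance here is hypothetical:

    BenchmarkHelper helper(10, 100);
    std::string report;
    // header plus one tab-separated row per benchmark, same layout the old printf path emitted
    report += helper.runOperationSuit({&myScalarBenchmark}, "scalar suite");
    nd4j_printf("%s\n", report.c_str());

- void BenchmarkHelper::benchmarkOperation(OpBenchmark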
&benchmark) { + std::string BenchmarkHelper::benchmarkOperation(OpBenchmark &benchmark) { for (uint i = 0; i < _wIterations; i++) benchmark.executeOnce(); @@ -57,9 +57,9 @@ namespace nd4j { std::sort(timings.begin(), timings.end()); Nd4jLong median = timings[_rIterations / 2]; - NDArray n = NDArrayFactory::create(timings, LaunchContext::defaultContext()); + auto n = NDArrayFactory::create(timings, LaunchContext::defaultContext()); - double stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); + auto stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); auto min = n.reduceNumber(nd4j::reduce::Min).e(0); auto max = n.reduceNumber(nd4j::reduce::Max).e(0); @@ -71,10 +71,16 @@ namespace nd4j { auto a = benchmark.axis(); auto inpl = benchmark.inplace(); + std::string temp; + temp.resize(65536); + // printing out stuff - nd4j_printf("%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(), + snprintf(const_cast(temp.data()), temp.length(), "%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(), _wIterations, _rIterations, t.c_str(), inpl.c_str(), s.c_str(), strides.c_str(), a.c_str(), o.c_str(), nd4j::math::nd4j_floor(sumT), median, min, max, stdev); + + auto pos = temp.find('\n'); + return temp.substr(0, pos + 1); } void BenchmarkHelper::benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z) { @@ -126,47 +132,44 @@ namespace nd4j { nd4j::math::nd4j_floor(sumT), median, min, max, stdev); } - void BenchmarkHelper::runOperationSuit(std::initializer_list benchmarks, const char *msg) { + std::string BenchmarkHelper::runOperationSuit(std::initializer_list benchmarks, const char *msg) { std::vector ops(benchmarks); - runOperationSuit(ops, msg); + return runOperationSuit(ops, msg); } - void BenchmarkHelper::runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg) { + std::string BenchmarkHelper::runOperationSuit(OpBenchmark* benchmark) { + return benchmarkOperation(*benchmark); + } + + std::string BenchmarkHelper::runOperationSuit(std::vector &benchmarks, bool postHeaders, const char *msg) { + std::string result; + if (msg != nullptr && postHeaders) { - nd4j_printf("\n%s\n", msg); + result += "\n"; + result += msg; + result += "\n"; } if (postHeaders) - printHeader(); + result += printHeader(); for (auto v:benchmarks) - benchmarkOperation(*v); + result += benchmarkOperation(*v); + + return result; } - void BenchmarkHelper::runScalarSuit() { - printHeader(); - - std::initializer_list> shapes = {{100}, {32, 256}, {32, 150, 200}, {32, 3, 244, 244}, {32, 64, 128, 256}}; - std::initializer_list dataTypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE}; - std::initializer_list ops = {scalar::Add, scalar::Divide, scalar::Pow}; - - for (const auto &d:dataTypes) { - for (const auto &o:ops) { - for (const auto &s:shapes) { - //benchmarkScalarOperation(o, 2.0, s, d); - } - } - } - } - - void BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(DeclarableBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string result; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + result += "\n"; + result += message; + result += 
"\n"; } - printHeader(); + result += printHeader(); std::vector list; @@ -175,25 +178,26 @@ namespace nd4j { auto clone = reinterpret_cast(op->clone()); clone->setContext(ctx); - list.emplace_back(clone); + + result += runOperationSuit(clone); + + delete clone; } - runOperationSuit(list, false); - - // removing everything - for (auto v:list) { - delete reinterpret_cast(v); - } + return result; } - void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -217,16 +221,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ScalarBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet z; @@ -248,23 +256,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { - + std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -288,16 +300,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(TransformBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet z; @@ -319,22 +335,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string output; auto parameters = parametersBatch.parameters(); if (message != nullptr) { - 
nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -358,16 +379,19 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string output; ResultSet x; x.setNonRemovable(); ResultSet z; @@ -389,19 +413,24 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } printHeader(); @@ -436,16 +465,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(ReductionBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet y; @@ -474,22 +507,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(BroadcastBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -518,23 +556,28 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output 
+= message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -562,16 +605,20 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } - void BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message) { + std::string BenchmarkHelper::runOperationSuit(PairwiseBenchmark *op, const std::function& func, const char *message) { + std::string output; + ResultSet x; x.setNonRemovable(); ResultSet y; @@ -597,22 +644,27 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, message); + output += runOperationSuit(result, message); // removing everything for (auto v:result) { delete reinterpret_cast(v); } + + return output; } - void BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { + std::string BenchmarkHelper::runOperationSuit(MatrixBenchmark *op, const std::function& func, ParametersBatch ¶metersBatch, const char *message) { auto parameters = parametersBatch.parameters(); + std::string output; if (message != nullptr) { - nd4j_printf("\n%s\n", message); + output += "\n"; + output += message; + output += "\n"; } - printHeader(); + output += printHeader(); for (auto &p: parameters) { ResultSet x; @@ -637,12 +689,14 @@ namespace nd4j { result.emplace_back(clone); } - runOperationSuit(result, false); + output += runOperationSuit(result, false); // removing everything for (auto v:result) { delete reinterpret_cast(v); } } + + return output; } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp index debca0053..4cb33d8d4 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp @@ -42,13 +42,13 @@ CUSTOM_OP_IMPL(lstmBlock, 9, 7, false, 2, 2) { auto b = INPUT_VARIABLE(8); // biases, [4*numUnits] auto i = OUTPUT_VARIABLE(0); // Output - input modulation gate activations [seqLen, bS, numUnits] - auto c = OUTPUT_VARIABLE(1); // Activations, cell state (pre tanh) [seqLen, bs, numUnits] + auto c = OUTPUT_VARIABLE(1); // Activations, cell state (pre tanh) [seqLen, bs, numUnits] auto f = OUTPUT_VARIABLE(2); // Output - forget gate activations [seqLen, bs, numUnits] auto o = OUTPUT_VARIABLE(3); // Output - output gate activations [seqLen, bs, numUnits] auto z = OUTPUT_VARIABLE(4); // Output - input gate activations [seqLen, bs, numUnits] auto h = OUTPUT_VARIABLE(5); // Cell state, post tanh [seqLen, bs, numUnits] auto y = OUTPUT_VARIABLE(6); // current cell output [seqLen, bS, numProj], time t - + const int peephole = INT_ARG(0); // if 1, provide peephole connections const int dataFormat = INT_ARG(1); // 0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]; 2=NTS=[mb,seqLen,size] const double forgetBias = T_ARG(0); @@ -117,7 +117,7 @@ DECLARE_SHAPE_FN(lstmBlock) { //7 outputs, all same shape/type return SHAPELIST(s1, s1, s1, s1, s1, s1, s1); -} +} } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index f2e4e77bc..1a43fb250 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -56,7 +56,7 @@ 
void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector<int> dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); -PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis}); @@ -72,7 +72,7 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->eleme } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; -PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i+1], {axis}); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index f0f5697d0..de4a0b08d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -171,15 +171,8 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast const int numUnits = cLast->sizeAt(1); //Concat inputs: [xt, yt-1]: concat([bs,nIn],[bs,nOut]) -> [bs, (nIn+nOut)] - nd4j::ops::concat concat; - Context cContext(119); auto concatOut = NDArrayFactory::create(xt->ordering(), {xt->sizeAt(0), xt->sizeAt(1) + yLast->sizeAt(1)}, xt->dataType(), xt->getContext()); - cContext.setInputArray(0, const_cast<NDArray*>(xt), false); - cContext.setInputArray(1, const_cast<NDArray*>(yLast), false); - cContext.setOutputArray(0, &concatOut, false); - cContext.getIArguments()->emplace_back(1); - - concat.execute(&cContext); + helpers::concat(xt->getContext(), {const_cast<NDArray*>(xt), const_cast<NDArray*>(yLast)}, concatOut, {1}); //NDArray* NDArrayFactory::create_( const char order, const std::vector<Nd4jLong> &shape, nd4j::DataType dataType, nd4j::memory::Workspace* workspace) { std::vector<Nd4jLong> shape = {bS, 4*numUnits}; diff --git a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp index d115f3fd0..da2175a36 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp @@ -45,10 +45,26 @@ namespace nd4j { const NDArray* iSeq, const NDArray* cSeq, const NDArray* fSeq, const NDArray* oSeq, const NDArray* zSeq, const NDArray* hSeq, const NDArray* ySeq, const std::vector<double>& params, const int dataFormat){ - const int seqLen = xSeq->sizeAt(0); - const int mb = xSeq->sizeAt(1); - const int inSize = xSeq->sizeAt(2); - const int outSize = iSeq->sizeAt(2); + int seqLen, mb, inSize, outSize; + + if(dataFormat == 0) { + seqLen = xSeq->sizeAt(0); + mb = xSeq->sizeAt(1); + inSize = xSeq->sizeAt(2); + outSize = iSeq->sizeAt(2); + } + else if(dataFormat == 1) { + seqLen = xSeq->sizeAt(2); + mb = xSeq->sizeAt(0); + inSize = xSeq->sizeAt(1); + outSize = iSeq->sizeAt(1); + } + else if(dataFormat == 2) { + seqLen = xSeq->sizeAt(1); + mb = xSeq->sizeAt(0); + inSize = xSeq->sizeAt(2); + outSize = iSeq->sizeAt(2); + } const std::vector<Nd4jLong> inSliceShape({mb,inSize}); const std::vector<Nd4jLong> outSliceShape({mb,outSize});
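For reference, the three dataFormat layouts above place the same dimensions on different axes (0=TNS=[seqLen,mb,size], 1=NST=[mb,size,seqLen], 2=NTS=[mb,seqLen,size]). A compact equivalent of that branch, shown only as a sketch (the helper name is hypothetical, not part of the patch):

    // decode (seqLen, mb, inSize) from x according to dataFormat
    static void decodeLstmShape(const NDArray* x, int dataFormat, int& seqLen, int& mb, int& inSize) {
        const int tAxis = (dataFormat == 0) ? 0 : (dataFormat == 1 ? 2 : 1); // time axis
        const int nAxis = (dataFormat == 0) ? 1 : 0;                         // batch axis
        const int sAxis = (dataFormat == 1) ? 1 : 2;                         // feature axis
        seqLen = x->sizeAt(tAxis);
        mb     = x->sizeAt(nAxis);
        inSize = x->sizeAt(sAxis);
    }

diff --git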
a/libnd4j/include/performance/benchmarking/BenchmarkSuit.h b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h new file mode 100644 index 000000000..1a77dbd9f --- /dev/null +++ b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h @@ -0,0 +1,41 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef LIBND4J_BENCHMARKSUIT_H +#define LIBND4J_BENCHMARKSUIT_H + +#include +#include +#include +#include +#include + +namespace nd4j { + class ND4J_EXPORT BenchmarkSuit { + public: + BenchmarkSuit() = default; + ~BenchmarkSuit() = default; + + virtual std::string runSuit() = 0; + }; +} + + +#endif //DEV_TESTS_BENCHMARKSUIT_H diff --git a/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h new file mode 100644 index 000000000..dc2b63a4d --- /dev/null +++ b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef LIBND4J_FULLBENCHMARKSUIT_H +#define LIBND4J_FULLBENCHMARKSUIT_H + +#include + +namespace nd4j { + class FullBenchmarkSuit : public BenchmarkSuit { + public: + std::string runSuit() override; + }; +} + + +#endif //DEV_TESTS_FULLBENCHMARKSUIT_H diff --git a/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h new file mode 100644 index 000000000..35215d032 --- /dev/null +++ b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef LIBND4J_LIGHTBENCHMARKSUIT_H +#define LIBND4J_LIGHTBENCHMARKSUIT_H + +#include + +namespace nd4j { + class LightBenchmarkSuit : public BenchmarkSuit { + public: + std::string runSuit() override; + }; +} + + +#endif //DEV_TESTS_LIGHTBENCHMARKSUIT_H diff --git a/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp new file mode 100644 index 000000000..902480092 --- /dev/null +++ b/libnd4j/include/performance/benchmarking/impl/BenchmarkSuit.cpp @@ -0,0 +1,20 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#include diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp new file mode 100644 index 000000000..40ecb6214 --- /dev/null +++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp @@ -0,0 +1,1921 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#ifdef _RELEASE + int wIterations = 4; + int rIterations = 20; + int gemmRegularUpperPow = 11; + int scalarBenchmarkPowLimit = 26; + int transformBenchmarkPowLimit = 26; + int intermediateTransformPowLimit = 22; + int intermediateTransformPowLimit2 = 18; + int pairwisePowLimit = 26; + int heavyPowLimit = 22; + int nonEwsPowLimit = 10; + int reduceScalarPowLimit = 26; + int stridedReductionPowLimit = 20; + int mismatchedAssignPowLimit = 26; + int gatherOpPowLimit = 18; + int gatherOpPowLimit2 = 16; + int gatherOpPowLimit3 = 12; + int broadcastMatrixRankLimit = 5; + int limit30 = 30; + int limit26 = 26; + int limit24 = 24; + int limit22 = 22; + int limit20 = 20; + int limit18 = 18; + int limit10 = 10; + int limit5 = 5; + int limit3 = 3; +#else + int wIterations = 0; + int rIterations = 1; + int gemmRegularUpperPow = 7; + int scalarBenchmarkPowLimit = 10; + int transformBenchmarkPowLimit = 10; + int intermediateTransformPowLimit = 10; + int intermediateTransformPowLimit2 = 10; + int pairwisePowLimit = 10; + int heavyPowLimit = 10; + int nonEwsPowLimit = 6; + int reduceScalarPowLimit = 10; + int stridedReductionPowLimit = 12; + int mismatchedAssignPowLimit = 2; + int gatherOpPowLimit = 10; + int gatherOpPowLimit2 = 8; + int gatherOpPowLimit3 = 8; + int broadcastMatrixRankLimit = 3; + int limit26 = 8; + int limit24 = 8; + int limit22 = 8; + int limit20 = 8; + int limit18 = 8; + int limit10 = 4; + int limit5 = 3; + int limit3 = 1; +#endif + +namespace nd4j { + + static std::string layerNormBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters nhwc("nhwc"); //0 = nchw + +#ifdef _RELEASE + int c = 32; + int hw = 64; +#else + int c = 3; + int hw = 8; +#endif + + ParametersBatch batch({&nhwc}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + + int axis; + if (n == 0) { + //nchw + auto input = NDArrayFactory::create_('c', {16, c, hw, hw}); + auto output = NDArrayFactory::create_('c', {16, c, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + axis = 1; + } else { + auto input = NDArrayFactory::create_('c', {32, hw, hw, c}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, c}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + axis = 3; + } + + auto bias = NDArrayFactory::create_('c', {c}); + ctx->setInputArray(1, bias, true); + auto iargs = new Nd4jLong[1]; + iargs[0] = axis; + ctx->setIArguments(iargs, 1); + delete[] iargs; + + return ctx; + }; + + nd4j::ops::layer_norm layerNorm; + DeclarableBenchmark benchmark(layerNorm, "layer norm"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Layer Norm"); + + return output; + } + + + static std::string maxPool3DBenchmark(){ + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters ncdhw("ncdhw"); //1 = ndhwc + ParametersBatch batch({&ncdhw}); + + nd4j::ops::maxpool3dnew maxpool3Dnew; + DeclarableBenchmark benchmark(maxpool3Dnew, "maxPool3d"); + +#ifdef _RELEASE + int mb = 16; + int chIn = 16; + int chOut = 16; + int dhw = 64; +#else + int mb = 1; + int chIn = 3; + int chOut = 3; + int dhw = 16; +#endif + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int format = p.getIntParam("ncdhw"); + + //Set inputs 
and outputs + //Same mode + stride 1: output is same shape as input + if(format == 1) { + //NDHWC + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + } else { + //NCDHW + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + } + + auto iargs = new Nd4jLong[15]; + //Kernel, strides, padding, dilation - x3 each + iargs[0] = 3; //Kernel + iargs[1] = 3; + iargs[2] = 3; + iargs[3] = 1; //Stride + iargs[4] = 1; + iargs[5] = 1; + iargs[6] = 0; //Padding + iargs[7] = 0; + iargs[8] = 0; + iargs[9] = 1; //Dilation + iargs[10] = 1; + iargs[11] = 1; + iargs[12] = 1; //Same mode + iargs[13] = 0; //Unused for max + iargs[14] = format; //0 = ncdhw + ctx->setIArguments(iargs, 14); + delete[] iargs; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "maxPool3d"); + return output; + } + + + static std::string conv3dBenchmark(){ + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters ncdhw("ncdhw"); //1 = ndhwc + ParametersBatch batch({&ncdhw}); + + nd4j::ops::conv3dnew conv3Dnew; + DeclarableBenchmark benchmark(conv3Dnew, "conv3d"); + +#ifdef _RELEASE + int mb = 16; + int chIn = 16; + int chOut = 16; + int dhw = 64; +#else + int mb = 1; + int chIn = 3; + int chOut = 3; + int dhw = 16; +#endif + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int format = p.getIntParam("ncdhw"); + + //Set inputs and outputs + //Same mode + stride 1: output is same shape as input + if(format == 1) { + //NDHWC + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, dhw, dhw, dhw, chIn}), true); + } else { + //NCDHW + ctx->setInputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {mb, chIn, dhw, dhw, dhw}), true); + } + + //Weights and bias: + ctx->setInputArray(1, NDArrayFactory::create_('c', {3, 3, 3, chIn, chOut}), true); + ctx->setInputArray(2, NDArrayFactory::create_('c', {chOut}), true); + + + auto iargs = new Nd4jLong[14]; + //Kernel, strides, padding, dilation - x3 each + iargs[0] = 3; //Kernel + iargs[1] = 3; + iargs[2] = 3; + iargs[3] = 1; //Stride + iargs[4] = 1; + iargs[5] = 1; + iargs[6] = 0; //Padding + iargs[7] = 0; + iargs[8] = 0; + iargs[9] = 1; //Dilation + iargs[10] = 1; + iargs[11] = 1; + iargs[12] = 1; //Same mode + iargs[13] = format; //0 = ncdhw + ctx->setIArguments(iargs, 14); + delete[] iargs; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "CNN3D"); + return output; + } + + + static std::string lstmBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen] +#ifdef _RELEASE + PredefinedParameters mb("mb", {1, 8, 64}); + PredefinedParameters nInOut("nInOut", {32, 256, 1024}); +#else + PredefinedParameters mb("mb", {1}); + PredefinedParameters nInOut("nInOut", {32}); +#endif + + ParametersBatch batch({&format, &mb, &nInOut}); + nd4j::ops::lstmBlock lstmBlock; + DeclarableBenchmark benchmark(lstmBlock, "lstm"); + + int seqLength = 32; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int f = p.getIntParam("format"); + int m = 
p.getIntParam("mb"); + int n = p.getIntParam("nInOut"); + + Nd4jLong l = 0; + ctx->setInputArray(0, NDArrayFactory::create_(l), true); //Max TS length (unused) + + + if (f == 0) { + //TNS format + ctx->setInputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('c', {seqLength, m, n}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('c', {seqLength, m, n}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('c', {seqLength, m, n}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('c', {seqLength, m, n}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('c', {seqLength, m, n}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('c', {seqLength, m, n}), true); //y + } else { + //NST format + ctx->setInputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('f', {m, n, seqLength}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('f', {m, n, seqLength}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('f', {m, n, seqLength}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('f', {m, n, seqLength}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('f', {m, n, seqLength}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('f', {m, n, seqLength}), true); //y + } + + auto cLast = NDArrayFactory::create_('c', {m, n}); + auto yLast = NDArrayFactory::create_('c', {m, n}); + auto W = NDArrayFactory::create_('c', {2 * n, 4 * n}); + auto Wci = NDArrayFactory::create_('c', {n}); + auto Wcf = NDArrayFactory::create_('c', {n}); + auto Wco = NDArrayFactory::create_('c', {n}); + auto b = NDArrayFactory::create_('c', {4 * n}); + + ctx->setInputArray(2, cLast, true); + ctx->setInputArray(3, yLast, true); + ctx->setInputArray(4, W, true); + ctx->setInputArray(5, Wci, true); + ctx->setInputArray(6, Wcf, true); + ctx->setInputArray(7, Wco, true); + ctx->setInputArray(8, b, true); + + auto iargs = new Nd4jLong[2]; + iargs[0] = 0; //No peephole + iargs[1] = f; + ctx->setIArguments(iargs, 2); + delete[] iargs; + + auto targs = new double[2]; + targs[0] = 1.0; //forget bias + targs[1] = 0.0; //cell clipping value + ctx->setTArguments(targs, 2); + delete[] targs; + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock"); + return output; + } + + static std::string batchnormBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {16}); +#endif + + ParametersBatch batch({&nhwc, &c, &hw}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int hw = p.getIntParam("hw"); + int ch = p.getIntParam("c"); + + auto args = new Nd4jLong[3]; + args[0] = args[1] = 1; //apply scale and offset + if (n == 0) { + auto input = NDArrayFactory::create_('c', {32, ch, hw, hw}); + auto output = NDArrayFactory::create_('c', {32, ch, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + args[2] = 1; //axis + } else { + auto input = 
NDArrayFactory::create_('c', {32, hw, hw, ch}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, ch}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + args[2] = 3; //axis + } + ctx->setIArguments(args, 3); + delete[] args; + + ctx->setInputArray(1, NDArrayFactory::create_('c', {ch}), true); //mean + auto v = NDArrayFactory::create_('c', {ch}); + v->assign(1.0f); + ctx->setInputArray(2, v, true); //variance + auto g = NDArrayFactory::create_('c', {ch}); + g->assign(1.0); + ctx->setInputArray(3, g, true); //gamma + auto b = NDArrayFactory::create_('c', {ch}); + b->assign(1.0); + ctx->setInputArray(4, b, true); //beta + + auto targs = new double[1]; + targs[0] = 1e-5; + ctx->setTArguments(targs, 1); + delete[] targs; + + return ctx; + }; + + nd4j::ops::batchnorm_new batchnorm; + DeclarableBenchmark benchmark(batchnorm, "batchnorm"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Batch Normalization"); + + return output; + } + + static std::string pool2dBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters k("k", {2, 3, 5}); + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters k("k", {2}); + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {8}); +#endif + + ParametersBatch batch({&nhwc, &k, &c, &hw}); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int hw = p.getIntParam("hw"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + auto output = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto args = new Nd4jLong[11]; + args[0] = args[1] = khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = 0; //Divisor mode - 0 = exclude padding in divisor + args[10] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 11); + delete[] args; + + return ctx; + }; + + nd4j::ops::avgpool2d avgpool2d; + DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); + output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pooling 2d Operation"); + + nd4j::ops::maxpool2d maxpool2d; + DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); + output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pooling 2d Operation"); + return output; + } + + static std::string conv2dBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Convolution2D op + BoolParameters nhwc("nhwc"); +#ifdef _RELEASE + PredefinedParameters k("k", {2, 3, 5}); + PredefinedParameters c("c", {3, 32, 128}); + PredefinedParameters hw("hw", {32, 128}); +#else + PredefinedParameters k("k", {2}); + PredefinedParameters c("c", {3}); + PredefinedParameters hw("hw", {8}); +#endif + ParametersBatch batch({&nhwc, &k, &c, &hw}); + nd4j::ops::conv2d conv2d; + DeclarableBenchmark benchmark(conv2d, "conv2d"); + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); 
+ int n = p.getIntParam("nhwc"); + int hw = p.getIntParam("hw"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + auto output = NDArrayFactory::create_('c', {32, p.getIntParam("c"), hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + auto output = NDArrayFactory::create_('c', {32, hw, hw, p.getIntParam("c")}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto b = NDArrayFactory::create_('c', {p.getIntParam("c")}); + auto w = NDArrayFactory::create_('c', {khw, khw, p.getIntParam("c"), p.getIntParam("c")}); // [kH, kW, iC, oC] always + + ctx->setInputArray(1, w, true); + ctx->setInputArray(2, b, true); + + auto args = new Nd4jLong[10]; + args[0] = args[1] = khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 10); + delete[] args; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d Operation"); + return output; + } + + static std::string rngBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + //Uniform, gaussian and bernoulli RNG generation + + IntPowerParameters length("length", 2, 4, scalarBenchmarkPowLimit, 3); //2^8 to 2^30 in steps of 3 + + ParametersBatch batch({&length}); + + auto gen01 = PARAMETRIC_D() { + auto ctx = new Context(1); + ctx->setInputArray(0, NDArrayFactory::create_('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray + ctx->setOutputArray(0, NDArrayFactory::create_('c', {1, p.getIntParam("length")}), true); + auto d = new double[2]; + d[0] = 0.0; + d[1] = 1.0; + ctx->setTArguments(d, 2); + delete[] d; + return ctx; + }; + + auto gen05 = PARAMETRIC_D() { + auto ctx = new Context(1); + ctx->setInputArray(0, NDArrayFactory::create_('c', {2},{1, p.getIntParam("length")}), true); //Shape as NDArray + ctx->setOutputArray(0, NDArrayFactory::create_('c', {1, p.getIntParam("length")}), true); + auto d = new double[1]; + d[0] = 0.5; + ctx->setTArguments(d, 1); + delete[] d; + return ctx; + }; + + nd4j::ops::LegacyRandomOp unif(random::UniformDistribution); + DeclarableBenchmark dbU(unif, "uniform"); + output += helper.runOperationSuit(&dbU, gen01, batch, "Uniform Distribution"); + + nd4j::ops::LegacyRandomOp gaussian(random::GaussianDistribution); + DeclarableBenchmark dbG(gaussian, "gaussian"); + output += helper.runOperationSuit(&dbG, gen01, batch, "Gaussian Distribution"); + + nd4j::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution); + DeclarableBenchmark dbTU(unif, "trunc.norm"); + output += helper.runOperationSuit(&dbTU, gen01, batch, "Truncated Normal Distribution"); + + nd4j::ops::LegacyRandomOp ln(random::LogNormalDistribution); + DeclarableBenchmark dbLN(ln, "uniform"); + output += helper.runOperationSuit(&dbLN, gen01, batch, "Log Normal Distribution"); + + nd4j::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution); + DeclarableBenchmark dbB(bernoulli, "bernoulli"); + output += helper.runOperationSuit(&dbB, gen05, batch, "Bernoulli Distribution"); + + nd4j::ops::LegacyRandomOp dropout(random::BernoulliDistribution); + DeclarableBenchmark dbD(dropout, "dropout"); + output += helper.runOperationSuit(&dbD, gen05, batch, "Dropout"); + + return output; + } + + static 
std::string gemmIrregularBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Basically the same as above, but with irregular shapes (not multiples of 8, etc) + +#ifdef _RELEASE + int tAMax = 1; + int tBMax = 1; + int b = 1024; + int c = 1024; +#else + int tAMax = 1; + int tBMax = 1; + int b = 32; + int c = 32; +#endif + + for (int tA = 0; tA <= tAMax; tA++) { + for (int tB = 0; tB <= tBMax; tB++) { + IntParameters d("d", 1020, 1028, 1); //1020, 1021, ..., 1028 + ParametersBatch dim({&d}); + + //Vary A.rows: + auto generator = PARAMETRIC_XYZ() { + auto a = p.getIntParam("d"); + std::vector shapeA; + std::vector shapeB; + if (tA) { + shapeA = {b, a}; + } else { + shapeA = {a, b}; + } + if (tB) { + shapeB = {c, b}; + } else { + shapeB = {b, c}; + } + auto A = NDArrayFactory::create_('c', shapeA); + auto B = NDArrayFactory::create_('c', shapeB); + auto C = NDArrayFactory::create_('f', {a, c}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n; + n += "Gemm (a.rows) - tA="; + n += std::to_string(tA); + n += ", tB="; + n += std::to_string(tB); + + MatrixBenchmark mb(1.0, 0.0, tA, tB, n); + + output += helper.runOperationSuit(&mb, generator, dim, n.c_str()); + + //Vary A.columns / B.rows: + auto generator2 = PARAMETRIC_XYZ() { + auto a = 1024; + auto b = p.getIntParam("d"); + auto c = 1024; + std::vector shapeA; + std::vector shapeB; + if (tA) { + shapeA = {b, a}; + } else { + shapeA = {a, b}; + } + if (tB) { + shapeB = {c, b}; + } else { + shapeB = {b, c}; + } + auto A = NDArrayFactory::create_('c', shapeA); + auto B = NDArrayFactory::create_('c', shapeB); + auto C = NDArrayFactory::create_('f', {a, c}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n2; + n2 += "Gemm (a.columns) - tA="; + n2 += std::to_string(tA); + n2 += ", tB="; + n2 += std::to_string(tB); + + MatrixBenchmark mb2(1.0, 0.0, tA, tB, n2); + + output += helper.runOperationSuit(&mb2, generator2, dim, n2.c_str()); + + //Vary B.columns: + auto generator3 = PARAMETRIC_XYZ() { + auto a = 1024; + auto b = 1024; + auto c = p.getIntParam("d"); + std::vector shapeA; + std::vector shapeB; + if (tA) { + shapeA = {b, a}; + } else { + shapeA = {a, b}; + } + if (tB) { + shapeB = {c, b}; + } else { + shapeB = {b, c}; + } + auto A = NDArrayFactory::create_('c', shapeA); + auto B = NDArrayFactory::create_('c', shapeB); + auto C = NDArrayFactory::create_('f', {a, c}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n3; + n3 += "Gemm (b.columns) - tA="; + n3 += std::to_string(tA); + n3 += ", tB="; + n3 += std::to_string(tB); + + MatrixBenchmark mb3(1.0, 0.0, tA, tB, n3); + + output += helper.runOperationSuit(&mb3, generator3, dim, n3.c_str()); + } + } + + return output; + } + + static std::string batchGemmBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Rank 3 - [32,1024,1024]x[32,1024,1024] + //Rank 4 - [4,8,1024,1024]x[4,8,1024,1024] + + IntParameters rank("rank", 3, 4, 1); + + ParametersBatch b({&rank}); + + auto generator = PARAMETRIC_D() { + auto rank = p.getIntParam("rank"); + auto ctx = new Context(1); + + if(rank == 3){ + ctx->setInputArray(0, NDArrayFactory::create_('c', {32, 1024, 1024}), true); + ctx->setInputArray(1, NDArrayFactory::create_('c', {32, 1024, 1024}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {32, 1024, 1024}), true); + } else { + ctx->setInputArray(0, NDArrayFactory::create_('c', {4, 
8, 1024, 1024}), true); + ctx->setInputArray(1, NDArrayFactory::create_('c', {4, 8, 1024, 1024}), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {4, 8, 1024, 1024}), true); + } + + return ctx; + }; + + nd4j::ops::matmul mmul; + DeclarableBenchmark benchmark(mmul, "mmul (batch)"); + output += helper.runOperationSuit(&benchmark, generator, b, "MMul (batch)"); + + return output; + } + + static std::string gemmRegularBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + for (int o = 0; o <= 1; o++) { + char resultOrder = (o == 0 ? 'f' : 'c'); + for (int tA = 0; tA <= 1; tA++) { + for (int tB = 0; tB <= 1; tB++) { + + IntPowerParameters pa("sz", 2, 7, gemmRegularUpperPow, 2); //2^7=128, 2^9=512, 2^11=2048 + + ParametersBatch b({&pa}); + + auto generator = PARAMETRIC_XYZ() { + auto s = p.getIntParam("sz"); + auto A = NDArrayFactory::create_('c', {s, s}); + auto B = NDArrayFactory::create_('c', {s, s}); + auto C = NDArrayFactory::create_(resultOrder, {s, s}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n; + n += "Gemm - tA="; + n += std::to_string(tA); + n += ", tB="; + n += std::to_string(tB); + n += ", cOrder="; + n += resultOrder; + + MatrixBenchmark mb(1.0, 0.0, tA != 0, tB != 0, n); + + output += helper.runOperationSuit(&mb, generator, b, n.c_str()); + } + } + } + + return output; + } + + static std::string scatterOpBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 up to 2^gatherOpPowLimit in steps of 4 + ParametersBatch batch({&length}); + + //Scatter 1D tests - 1d ref, 1d indices, 1d updates -> 1d output + nd4j::ops::scatter_upd scatter_update1; + DeclarableBenchmark sa1d(scatter_update1, "scatter_update1d"); + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int length = p.getIntParam("length"); + auto in = NDArrayFactory::create_('c', {length}); + auto indices = NDArrayFactory::create_('c', {length}); + auto updates = NDArrayFactory::create_('c', {length}); + + int* a = new int[length]; + for( int i=0; i<length; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy! + ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa1d, generator, batch, "Scatter Update - 1d"); + + //Scatter 2D tests - 2d input, 1d indices, 2d updates -> 2d output + IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^8 up to 2^gatherOpPowLimit2 in steps of 4 + PredefinedParameters cols("cols", {32}); + ParametersBatch batch2({&rows, &cols}); + nd4j::ops::scatter_upd scatter_update2; + DeclarableBenchmark sa2d(scatter_update2, "scatter_update2d"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = p.getIntParam("cols"); + auto in = NDArrayFactory::create_('c', {rows, cols}); + auto indices = NDArrayFactory::create_('c', {rows}); + auto updates = NDArrayFactory::create_('c', {rows, cols}); + + int* a = new int[rows]; + for( int i=0; i<rows; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy!
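+ // markInplace(true) below declares that input 0 doubles as output 0, so each iteration measures the scatter itself rather than an extra allocation plus copy.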
+ ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa2d, generator2, batch2, "Scatter Update - 2d"); + + //Scatter 3D tests - 3d input, 1d indices, 3d updates -> 3d output + IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); + PredefinedParameters sz1("sz1", {32}); + ParametersBatch batch3({&sz0, &sz1}); + nd4j::ops::scatter_upd scatter_update3; + DeclarableBenchmark sa3d(scatter_update3, "scatter3d"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + int sz0 = p.getIntParam("sz0"); + int sz1 = p.getIntParam("sz1"); + auto in = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + auto indices = NDArrayFactory::create_('c', {sz0}); + auto updates = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + + int* a = new int[sz0]; + for( int i=0; i<sz0; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setInputArray(2, updates, true); + ctx->setOutputArray(0, in); //Needs to be inplace to avoid copy! + ctx->markInplace(true); + return ctx; + }; + + output += helper.runOperationSuit(&sa3d, generator3, batch3, "Scatter Update - 3d"); + return output; + } + + static std::string gatherOpBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, gatherOpPowLimit, 4); //2^10 up to 2^gatherOpPowLimit in steps of 4 + ParametersBatch batch({&length}); + + //Gather 1D tests - 1d input, 1d indices -> 1d output + nd4j::ops::gather gather1; + DeclarableBenchmark gather1d(gather1, "gather1d"); + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int length = p.getIntParam("length"); + auto in = NDArrayFactory::create_('c', {length}); + auto indices = NDArrayFactory::create_('c', {length}); + int* a = new int[length]; + for( int i=0; i<length; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {length}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather1d, generator, batch, "Gather - 1d"); + + //Gather 2D tests - 2d input, 1d indices -> 2d output + IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^8 up to 2^gatherOpPowLimit2 in steps of 4 + PredefinedParameters cols("cols", {32}); + ParametersBatch batch2({&rows, &cols}); + nd4j::ops::gather gather2; + DeclarableBenchmark gather2d(gather2, "gather2d"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = p.getIntParam("cols"); + auto in = NDArrayFactory::create_('c', {rows, cols}); + auto indices = NDArrayFactory::create_('c', {rows}); + + int* a = new int[rows]; + for( int i=0; i<rows; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {rows, cols}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather2d, generator2, batch2, "Gather - 2d"); + + //Gather 3D tests - 3d input, 1d indices -> 3d output + IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); //2^8 up to 2^gatherOpPowLimit3 in steps of 4 + PredefinedParameters sz1("sz1", {32}); + ParametersBatch batch3({&sz0, &sz1}); + nd4j::ops::gather gather3; + DeclarableBenchmark gather3d(gather3, "gather3d"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + int sz0 = p.getIntParam("sz0"); + int sz1 = p.getIntParam("sz1"); + auto in = NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}); + auto indices = 
NDArrayFactory::create_('c', {sz0}); + + int* a = new int[sz0]; + for( int i=0; i<sz0; i++ ){ + a[i] = i; //sequential indices + indices->p(i, a[i]); + } + delete[] a; + + ctx->setInputArray(0, in, true); + ctx->setInputArray(1, indices, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {sz0, sz1, 512/sz1}), true); + return ctx; + }; + + output += helper.runOperationSuit(&gather3d, generator3, batch3, "Gather - 3d"); + + return output; + } + + static std::string mismatchedOrdersAssignBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters rows("rows", 2, 2, mismatchedAssignPowLimit, 4); //2^2 up to 2^mismatchedAssignPowLimit in steps of 4 + BoolParameters cf("cf"); + + ParametersBatch batch({&rows, &cf}); + + auto generator = PARAMETRIC_XZ() { + int numElements = 67108864; //2^26 + int rows = p.getIntParam("rows"); + int cols = numElements / rows; + bool c = p.getIntParam("cf"); + + auto arr = NDArrayFactory::create_(c ? 'c' : 'f', {rows, cols}); + auto arr2 = NDArrayFactory::create_(c ? 'f' : 'c', {rows, cols}); + x.push_back(arr); + z.push_back(arr2); + }; + + TransformBenchmark tb(transform::AnyOps::Assign, "assign"); + output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign"); + + //Also test: NCHW to NHWC and back + BoolParameters nchw("nchw"); + ParametersBatch batch2({&nchw}); + auto generator2 = PARAMETRIC_XZ() { + bool nchw = p.getIntParam("nchw"); + + if(nchw) { + auto orig = NDArrayFactory::create_('c', {16, 32, 64, 64}); + orig->permutei({0,2,3,1}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {16, 64, 64, 32})); + } else { + auto orig = NDArrayFactory::create_('c', {16, 64, 64, 32}); + orig->permutei({0,3,1,2}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {16, 32, 64, 64})); + } + }; + + TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw"); + output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign"); + return output; + } + + static std::string broadcastOpsMatrixBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Broadcast ops: matrices for rank 3, 4, 5 + for( int rank=3; rank <= broadcastMatrixRankLimit; rank++ ){ + int numAxisTests = -1; + if(rank == 3){ + numAxisTests = 3; + } else if(rank == 4){ + numAxisTests = 6; + } else if(rank == 5){ + numAxisTests = 10; + } + + IntParameters testNum("testNum", 0, numAxisTests-1, 1); + ParametersBatch b({&testNum}); + + auto generator = PARAMETRIC_D(){ + int n = p.getIntParam("testNum"); + std::vector axis({}); + switch(n){ + //rank 3+ + case 0: + axis = std::vector({0,1}); + break; + case 1: + axis = std::vector({0,2}); + break; + case 2: + axis = std::vector({1,2}); + break; + //rank 4+ + case 3: + axis = std::vector({0,3}); + break; + case 4: + axis = std::vector({1,3}); + break; + case 5: + axis = std::vector({2,3}); + break; + //Rank 5 + case 6: + axis = std::vector({0,4}); + break; + case 7: + axis = std::vector({1,4}); + break; + case 8: + axis = std::vector({2,4}); + break; + case 9: + axis = std::vector({3,4}); + break; + } + + std::vector shape({}); + std::vector toBcShape({}); + int vectorLength; + if(rank == 3){ + shape = std::vector({64,64,64}); + toBcShape = std::vector({64,64,64}); + vectorLength = 64; + } else if(rank == 4){ + shape = std::vector({32,32,32,32}); + toBcShape = std::vector({32,32,32,32}); + vectorLength = 32; + } else if(rank == 5){ + shape = std::vector({16,16,16,16,16}); + toBcShape = std::vector({16,16,16,16,16}); + vectorLength = 16; + } 
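+ // toBcShape starts out identical to shape; the loop below collapses every dimension not listed in 'axis' to 1, producing the broadcastable operand (e.g. axis={0,1} at rank 3 yields {64,64,1}).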
+ + for( int i=0; i<rank; i++ ){ + if(std::find(axis.begin(), axis.end(), i) == axis.end()){ + toBcShape[i] = 1; + } + } + + auto ctx = new Context(1); + ctx->setInputArray(0, NDArrayFactory::create_('c', shape), true); + ctx->setInputArray(1, NDArrayFactory::create_('c', toBcShape), true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', shape), true); + return ctx; + }; + + std::string name; + name += "Broadcast Matrix Add (Custom) - Rank"; + name += std::to_string(rank); + + nd4j::ops::add op; + DeclarableBenchmark benchmark(op, "add"); + output += helper.runOperationSuit(&benchmark, generator, b, name.c_str()); + } + + return output; + } + + + static std::string broadcast2dBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + PredefinedParameters rows("rows", {65536}); + IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2, 2^6, 2^10 + BoolParameters axis("axis"); + BoolParameters inplace("inplace"); + + ParametersBatch batch({&rows, &cols, &axis, &inplace}); + + auto generator = PARAMETRIC_D() { + auto a = p.getIntParam("axis"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")}); + + auto ctx = new Context(1); + ctx->setInputArray(0, arr, true); + if(a == 0){ + ctx->setInputArray(1, NDArrayFactory::create_('c', {p.getIntParam("rows"), 1}), true); + } else { + ctx->setInputArray(1, NDArrayFactory::create_('c', {1, p.getIntParam("cols")}), true); + } + if (p.getIntParam("inplace") == 1) { + ctx->setOutputArray(0, arr); + ctx->markInplace(true); + } else { + ctx->setOutputArray(0, NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")}), true); + } + return ctx; + }; + + std::string s("add"); + nd4j::ops::add op; + DeclarableBenchmark benchmark(op, "add"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d"); + return output; + } + + static std::string broadcastBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + //Broadcast ops: vectors for rank 2, 3, 4, 5 + for( int axis=0; axis<=1; axis++ ){ + PredefinedParameters rows("rows", {65536}); + IntPowerParameters cols("cols", 2, 2, limit10, 4); //2^2, 2^6, 2^10 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&rows, &cols, &inplace}); + + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")}); + x.push_back(arr); + if(axis == 0){ + y.push_back(NDArrayFactory::create_('c', {p.getIntParam("rows")})); + } else { + y.push_back(NDArrayFactory::create_('c', {p.getIntParam("cols")})); + } + if (p.getIntParam("inplace") == 1) { + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")})); + } + }; + + std::string s("bAdd"); s += std::to_string(axis); s += "r2"; + BroadcastBenchmark bAdd(broadcast::Add, s, {axis}); + output += helper.runOperationSuit(&bAdd, generator, batch, "Broadcast Add - Rank 2"); + } + + for( int rank=3; rank<=5; rank++ ){ + for( int axis=1; axis<rank; axis++ ){ + std::vector shape({}); + int vectorLength; + if(rank == 3){ + shape = std::vector({32,128,128}); + vectorLength = 128; + } else if(rank == 4){ + shape = std::vector({16,64,64,64}); + vectorLength = 64; + } else if(rank == 5){ + shape = std::vector({16,48,48,48,48}); + vectorLength = 48; + } + + ParametersBatch batch({}); + + //Note: always inplace here + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', shape); + x.push_back(arr); + y.push_back(NDArrayFactory::create_('c', {vectorLength})); + z.push_back(arr); + }; + + std::string 
name("bArr-r"); name += std::to_string(rank); name += "a"; name += std::to_string(axis); + BroadcastBenchmark bAdd(broadcast::Add, name, {axis}); + std::string n2("Broadcast Add - Rank"); n2 += std::to_string(rank); n2 += " - axis="; n2 += std::to_string(axis); + output += helper.runOperationSuit(&bAdd, generator, batch, n2.c_str()); + } + } + + return output; + } + + static std::string fastStridedReductionNonEws() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters stride("stride", 2, 0, 10, 2); //2^0=1, ..., 2^10=1024 + + ParametersBatch batch({&stride}); + + //This is an edge case: technically an EWS *should* be available here + auto generator1 = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {131072 + (stride == 1 ? 0 : 1), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::interval(0,131072), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum"); + output += helper.runOperationSuit(&rbSum, (const std::function)(generator1), batch, "Strided Sum - No EWS Test 1"); + + + //No EWS defined for this case + auto generator2 = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {(stride == 1 ? 1 : 2) * 1024, 1024, stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::interval(0,2*1024,2), NDIndex::all(), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum2(reduce::SameOps::Sum, "stridedSumNoEWS"); + output += helper.runOperationSuit(&rbSum2, (const std::function)(generator2), batch, "Strided Sum - No EWS Test 2"); + + return output; + } + + static std::string fastStridedReductionIrregular() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 12, stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4 + PredefinedParameters stride("stride", {26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028}); + + ParametersBatch batch({&length, &stride}); + + auto generator = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::interval(0,1)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "stridedSum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Strided Sum - Irregular Strides"); + + return output; + } + + static std::string fastStridedReductionsRegular() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 12, 
stridedReductionPowLimit, 4); //2^12 to 2^20 in steps of 4 + IntPowerParameters stride("stride", 2, 0, 10); //2^0=1, ..., 2^10=1024 + + ParametersBatch batch({&length, &stride}); + + auto generator = PARAMETRIC_XYZ() { + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::point(0)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + x.push_back(strided); + y.push_back(nullptr); +// z.push_back(NDArrayFactory::create_(0.0f)); + z.push_back(NDArrayFactory::create_('c', {1})); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "Strided Sum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Strided Sum - Regular Strides (powers of 2)"); + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + auto stride = p.getIntParam("stride"); + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length"), stride}); + + NDArray* strided; + if(stride == 1){ + strided = arr; + } else { + IndicesList indices({NDIndex::all(), NDIndex::point(0)}); + strided = arr->subarray(indices); //All rows, first column + delete arr; + } + + strided->assign(1.0); + ctx->setInputArray(0, strided, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {1}), true); + auto iargs = new Nd4jLong[1]; + iargs[0] = 0; + ctx->setIArguments(iargs, 1); + delete[] iargs; + return ctx; + }; + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "stridedArgmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Strided Argmax"); + return output; + } + + static std::string fastReduceAlongDimBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + int length[] = {1024*1024, 64*1024*1024}; + int powLimit[] = {10, 20, 26}; + int powStep[] = {2, 2, 4}; + + for( int i=0; i < limit3; i++ ){ + IntPowerParameters rows("rows", 2, 0, powLimit[i], powStep[i]); + BoolParameters dim("dim"); + + + ParametersBatch batch({&rows, &dim}); + + auto generator = PARAMETRIC_XYZ() { + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + + x.push_back(arr); + y.push_back(NDArrayFactory::create_(dim)); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + z.push_back(result); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + std::string s1("Sum Along Dimension - "); + s1 += std::to_string(length[i]); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, s1.c_str()); + + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + Nd4jLong* dimArg = new Nd4jLong[1]; + dimArg[0] = dim; + ctx->setIArguments(dimArg, 1); + delete[] dimArg; + + ctx->setInputArray(0, arr, true); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + ctx->setOutputArray(0, result, true); + return ctx; + }; + + std::string s5("Argmax Along Dimension - "); 
+ s5 += std::to_string(length[i]); + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); + } + + return output; + } + + static std::string fastReduceToScalarBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, reduceScalarPowLimit, 4); //2^10 to 2^26 in steps of 4 + + ParametersBatch batch({&length}); + + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + + x.push_back(arr); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Sum - Full Array Reduction"); + + //Index reduction + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + + ctx->setInputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + ctx->setInputArray(1, NDArrayFactory::create_((Nd4jLong)0), true); + ctx->setOutputArray(0, NDArrayFactory::create_(0), true); + + return ctx; + }; + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction"); + + return output; + } + + static std::string fastNonEwsTransformBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters rowcol("rowcol", 2, 2, nonEwsPowLimit, 4); //2^2 to 2^14 in steps of 4 -> non-inplace case: 2x 2^10 x 2^10 = 128mb + BoolParameters inplace("inplace"); + + ParametersBatch batch({&rowcol, &inplace}); + + auto generator = PARAMETRIC_XZ() { + int r = p.getIntParam("rowcol"); + auto arr = NDArrayFactory::create_('c', {r, r+1}); + IndicesList indices({NDIndex::all(), NDIndex::interval(0,r-1)}); + auto view = arr->subarray(indices); + //nd4j_printf("VIEW ARRAY: rows=%lld, columns=%lld", view->sizeAt(0), view->sizeAt(1)); + x.push_back(view); + if(p.getIntParam("inplace") == 1){ + z.push_back(view); + } else { + z.push_back(NDArrayFactory::create_('c', {r,r})); + } + delete arr; + }; + + ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU_View"); + sbLRelu.setY(NDArrayFactory::create_(0.0)); + + TransformBenchmark tbExp(transform::StrictOps::Exp, "exp view"); + + output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU View"); + output += helper.runOperationSuit(&tbExp, generator, batch, "Exp View"); + + return output; + } + + static std::string fastPairwiseBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, pairwisePowLimit, 4); //2^10 to 2^26 in steps of 4 -> max is 512mb + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XYZ() { + auto arr1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + auto arr2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + x.push_back(arr1); + y.push_back(arr2); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr1); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + PairwiseBenchmark pb1(pairwise::Ops::Add, "Add"); + output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add"); + + PairwiseBenchmark pb2(pairwise::Ops::Multiply, "Multiply"); + output += helper.runOperationSuit(&pb2, generator, 
batch, "Pairwise Multiply"); + + return output; + } + + static std::string heavyTransformsBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22, steps of 4 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if (p.getIntParam("inplace") == 1) { + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + //Ops to test: erf (transform), betainc (custom), polygamma, synthetic ops? + TransformBenchmark erf(transform::StrictOps::Erf, "Erf"); + output += helper.runOperationSuit(&erf, generator, batch, "Error Function (Erf)"); + + ParametersBatch batch2({&length}); + nd4j::ops::polygamma op1; + DeclarableBenchmark pg(op1, "polygamma"); + auto generator2 = PARAMETRIC_D() { + auto ctx = new Context(1); + auto in0 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in0->assign(0.25); + auto in1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in1->assign(0.5); + ctx->setInputArray(0, in0, true); + ctx->setInputArray(1, in1, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + return ctx; + }; + + + IntPowerParameters lengthBetaInc("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22 in steps of 4 + ParametersBatch batch3({&lengthBetaInc}); + nd4j::ops::betainc op2; + DeclarableBenchmark binc(op2, "betainc"); + auto generator3 = PARAMETRIC_D() { + auto ctx = new Context(1); + auto in0 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in0->assign(0.25); + auto in1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in1->assign(0.5); + auto in2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + in2->assign(0.75); + ctx->setInputArray(0, in0, true); + ctx->setInputArray(1, in1, true); + ctx->setInputArray(2, in2, true); + ctx->setOutputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + return ctx; + }; + + output += helper.runOperationSuit(&pg, generator2, batch2, "PolyGamma Function"); + output += helper.runOperationSuit(&binc, generator3, batch3, "Incomplete Beta Function (BetaInc)"); + + return output; + } + + static std::string intermediateTransformsBenchmark() { + std::string output; + + //Non-inplace: 2x 2^26 elements FP32 -> 512MB + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, intermediateTransformPowLimit, 4); //2^20 to 2^22 in steps of 4 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + TransformBenchmark tbTanh(transform::StrictOps::Tanh, "tanh"); + TransformBenchmark tbGelu(transform::StrictOps::GELU, "gelu"); + + output += helper.runOperationSuit(&tbTanh, generator, batch, "Tanh"); + output += helper.runOperationSuit(&tbGelu, generator, batch, "gelu"); + + + //2x 1024 cols x 2^18 = 2GB + IntPowerParameters rows("rows", 2, 10, intermediateTransformPowLimit2, 4); + PredefinedParameters cols("cols", {4, 128, 1024}); + + ParametersBatch batch2({&rows, 
&cols, &inplace}); + + auto generator2 = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("rows"), p.getIntParam("cols")})); + } + }; + + TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax"); + + output += helper.runOperationSuit(&tbSoftmax, generator2, batch2, "Softmax"); + + return output; + } + + static std::string fastTransformsBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + IntPowerParameters length("length", 2, 10, transformBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + ScalarBenchmark sbLRelu(scalar::Ops::LeakyRELU, "LeakyRELU"); + sbLRelu.setY(NDArrayFactory::create_(0.0)); + + TransformBenchmark tbAbs(transform::SameOps::Abs, "abs"); + TransformBenchmark tbExp(transform::StrictOps::Exp, "exp"); + + output += helper.runOperationSuit(&sbLRelu, generator, batch, "LeakyRELU"); + output += helper.runOperationSuit(&tbAbs, generator, batch, "Abs"); + output += helper.runOperationSuit(&tbExp, generator, batch, "Exp"); + + return output; + } + + static std::string fastScalarBenchmark() { + std::string output; + BenchmarkHelper helper(wIterations, rIterations); + + IntPowerParameters length("length", 2, 10, scalarBenchmarkPowLimit, 4); //2^10 to 2^30 in steps of 4 - 2^10, 2^14, ..., 2^26 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd"); + ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv"); + ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow"); + + + sbAdd.setY(NDArrayFactory::create_(3.14159265359)); + sbDiv.setY(NDArrayFactory::create_(3.14159265359)); + sbPow.setY(NDArrayFactory::create_(3.14159265359)); + + + output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359) - F32"); + output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359) - F32"); + output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359) - F32"); + + return output; + } + + + static long nowMs(){ + auto s = std::chrono::system_clock::now().time_since_epoch(); + auto v = std::chrono::duration_cast(s).count(); + return v; + } + + static long duration(long start){ + return nowMs() - start; + } + + static long done(long start){ + long dur = duration(start); + nd4j_printf("Done: %i ms\n", dur); + return nowMs(); + } + + + std::string FullBenchmarkSuit::runSuit() { + std::string result; + + long start = nowMs(); + + // set 1 + nd4j_printf("Running FullBenchmarkSuite.fastScalarBenchmark\n", ""); + result += fastScalarBenchmark(); + start = 
done(start); + nd4j_printf("Running FullBenchmarkSuite.fastTransformsBenchmark\n", ""); + result += fastTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.intermediateTransformsBenchmark\n", ""); + result += intermediateTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastPairwiseBenchmark\n", ""); + result += fastPairwiseBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.heavyTransformsBenchmark\n", ""); + result += heavyTransformsBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastNonEwsTransformBenchmark\n", ""); + result += fastNonEwsTransformBenchmark(); + start = done(start); + + // set 2 + nd4j_printf("Running FullBenchmarkSuite.fastReduceToScalarBenchmark\n", ""); + result += fastReduceToScalarBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastReduceAlongDimBenchmark\n", ""); + result += fastReduceAlongDimBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionsRegular\n", ""); + result += fastStridedReductionsRegular(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionIrregular\n", ""); + result += fastStridedReductionIrregular(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.fastStridedReductionNonEws\n", ""); + result += fastStridedReductionNonEws(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcastBenchmark\n", ""); + result += broadcastBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcast2dBenchmark\n", ""); + result += broadcast2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.broadcastOpsMatrixBenchmark\n", ""); + result += broadcastOpsMatrixBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.mismatchedOrdersAssignBenchmark\n", ""); + result += mismatchedOrdersAssignBenchmark(); + start = done(start); + + + // set 3 + nd4j_printf("Running FullBenchmarkSuite.gatherOpBenchmark\n", ""); + result += gatherOpBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.scatterOpBenchmark\n", ""); + result += scatterOpBenchmark(); + start = done(start); + + // set 4 + nd4j_printf("Running FullBenchmarkSuite.gemmRegularBenchmark\n", ""); + result += gemmRegularBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.gemmIrregularBenchmark\n", ""); + result += gemmIrregularBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.rngBenchmark\n", ""); + result += rngBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.conv2dBenchmark\n", ""); + result += conv2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.pool2dBenchmark\n", ""); + result += pool2dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.batchnormBenchmark\n", ""); + result += batchnormBenchmark(); + start = done(start); + + nd4j_printf("Running FullBenchmarkSuite.lstmBenchmark\n", ""); + result += lstmBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.conv3dBenchmark\n", ""); + result += conv3dBenchmark(); + start = done(start); + nd4j_printf("Running FullBenchmarkSuite.maxPool3DBenchmark\n", ""); + result += maxPool3DBenchmark(); + start = done(start); +// nd4j_printf("Running FullBenchmarkSuite.layerNormBenchmark\n", ""); +// result += layerNormBenchmark(); +// start = done(start); + + 
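+ // At this point 'result' holds every sub-suite report concatenated in run order; NativeOps::runFullBenchmarkSuit() copies it into a NUL-terminated heap buffer for callers (see the blas/cpu/NativeOps.cpp and blas/cuda/NativeOps.cu changes above).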
return result; + } + + +} \ No newline at end of file diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp new file mode 100644 index 000000000..ae9db9b6c --- /dev/null +++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp @@ -0,0 +1,639 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include "performance/benchmarking/LightBenchmarkSuit.h" + +#ifdef _RELEASE +#define WARMUP 3 +#define NUM_ITER 10 + +#else + +#define WARMUP 0 +#define NUM_ITER 1 + +#endif + +namespace nd4j { + + template + static std::string transformBenchmark() { + std::string output; + output += "transformBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 - 4MB + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + ScalarBenchmark sbRelu(scalar::Ops::RELU, "RELU"); + sbRelu.setY(NDArrayFactory::create_(0.0)); + + TransformBenchmark tbSigmoid(transform::StrictOps::Sigmoid, "sigmoid"); + TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax"); + + output += helper.runOperationSuit(&sbRelu, generator, batch, "RELU"); + output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Sigmoid"); + output += helper.runOperationSuit(&tbSoftmax, generator, batch, "Softmax"); + + return output; + } + + template + static std::string scalarBenchmark() { + std::string output; + output += "scalarBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + + IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + arr->assign(1.0); + x.push_back(arr); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd"); + ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv"); + ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow"); + + + sbAdd.setY(NDArrayFactory::create_(3.14159265359)); + sbDiv.setY(NDArrayFactory::create_(3.14159265359)); + sbPow.setY(NDArrayFactory::create_(3.14159265359));
+ + + output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359)"); + output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359)"); + output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359)"); + + return output; + } + + + template + static std::string pairwiseBenchmark() { + std::string output; + output += "pairwiseBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + IntPowerParameters length("length", 2, 8, 20, 4); //2^8 to 2^20 in steps of 4 - 2^8, 2^12, 2^16, 2^20 + BoolParameters inplace("inplace"); + + ParametersBatch batch({&length, &inplace}); + + auto generator = PARAMETRIC_XYZ() { + auto arr1 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + auto arr2 = NDArrayFactory::create_('c', {p.getIntParam("length")}); + x.push_back(arr1); + y.push_back(arr2); + if(p.getIntParam("inplace") == 1){ + z.push_back(arr1); + } else { + z.push_back(NDArrayFactory::create_('c', {p.getIntParam("length")})); + } + }; + + PairwiseBenchmark pb1(pairwise::Ops::Add, "Add"); + output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add"); + + PairwiseBenchmark pb2(pairwise::Ops::Divide, "Divide"); + output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Divide"); + + return output; + } + + static std::string mismatchedOrderAssign() { + std::string output; + BenchmarkHelper helper(WARMUP, NUM_ITER); + + IntPowerParameters rows("rows", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 + BoolParameters cf("cf"); + + ParametersBatch batch({&rows, &cf}); + + auto generator = PARAMETRIC_XZ() { + int numElements = 4194304; //2^22 + int rows = p.getIntParam("rows"); + int cols = numElements / rows; + bool c = p.getIntParam("cf"); + + auto arr = NDArrayFactory::create_(c ? 'c' : 'f', {rows, cols}); + auto arr2 = NDArrayFactory::create_(c ? 'f' : 'c', {rows, cols}); + x.push_back(arr); + z.push_back(arr2); + }; + + TransformBenchmark tb(transform::AnyOps::Assign, "assign"); + output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign F32"); + + //Also test: NCHW to NHWC and back + BoolParameters nchw("nchw"); + int mb = 8; + int hw = 64; + int c = 3; + ParametersBatch batch2({&nchw}); + auto generator2 = PARAMETRIC_XZ() { + bool nchw = p.getIntParam("nchw"); + + if(nchw) { + auto orig = NDArrayFactory::create_('c', {mb, c, hw, hw}); + orig->permutei({0,2,3,1}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {mb, hw, hw, c})); + } else { + auto orig = NDArrayFactory::create_('c', {mb, hw, hw, c}); + orig->permutei({0,3,1,2}); + x.push_back(orig); + z.push_back(NDArrayFactory::create_('c', {mb, c, hw, hw})); + } + }; + + TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw"); + output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign FP32"); + return output; + } + + template + static std::string gemmBenchmark() { + std::string output; + output += "gemm " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + for (int o = 0; o <= 1; o++) { + char resultOrder = (o == 0 ? 
'f' : 'c'); + IntPowerParameters sz("sz", 2, 4, 10, 2); //2^4=16, ..., 2^10=1024 -> 4 elements + + ParametersBatch b({&sz}); + + auto generator = PARAMETRIC_XYZ() { + auto a = p.getIntParam("sz"); + auto b = p.getIntParam("sz"); + auto c = p.getIntParam("sz"); + std::vector shapeA; + std::vector shapeB; + shapeA = {a, b}; + shapeB = {b, c}; + auto A = NDArrayFactory::create_('c', shapeA); + auto B = NDArrayFactory::create_('c', shapeB); + auto C = NDArrayFactory::create_(resultOrder, {a, c}); + + x.push_back(A); + y.push_back(B); + z.push_back(C); + }; + + std::string n; + n += "Gemm - cOrder="; + n += resultOrder; + + MatrixBenchmark mb(1.0, 0.0, false, false, n); + + output += helper.runOperationSuit(&mb, generator, b, n.c_str()); + } + + return output; + } + + template + static std::string reduceFullBenchmark() { + std::string output; + output += "reduceFullBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + + IntPowerParameters length("length", 2, 8, 20, 4); //2^8, 2^12, 2^16, 2^20 + + ParametersBatch batch({&length}); + + auto generator = PARAMETRIC_XYZ() { + auto arr = NDArrayFactory::create_('c', {p.getIntParam("length")}); + + x.push_back(arr); + y.push_back(nullptr); + z.push_back(NDArrayFactory::create_(0.0f)); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbProd(reduce::SameOps::Prod, "prod"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Sum - Full Array Reduction"); + output += helper.runOperationSuit(&rbProd, (const std::function)(generator), batch, "Product - Full Array Reduction"); + output += helper.runOperationSuit(&rbMax, (const std::function)(generator), batch, "Maximum - Full Array Reduction"); + + //Index reduction + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + + ctx->setInputArray(0, NDArrayFactory::create_('c', {p.getIntParam("length")}), true); + ctx->setInputArray(1, NDArrayFactory::create_((Nd4jLong)0), true); + ctx->setOutputArray(0, NDArrayFactory::create_(0), true); + + return ctx; + }; + output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction"); + return output; + } + + template + static std::string reduceDimBenchmark(){ + std::string output; + output += "reduceDimBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT()); + + BenchmarkHelper helper(WARMUP, NUM_ITER); + + int length[] = {1024*1024}; + int pow[] = {10}; + + for( int i=0; i<1; i++ ){ + IntPowerParameters rows("rows", 2, 0, pow[i], 2); + BoolParameters dim("dim"); + + + ParametersBatch batch({&rows, &dim}); + + auto generator = PARAMETRIC_XYZ() { + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + + x.push_back(arr); + y.push_back(NDArrayFactory::create_(dim)); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + z.push_back(result); + }; + + ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum"); + ReductionBenchmark rbMax(reduce::SameOps::Max, "max"); + + std::string s1("Sum Along Dimension - "); + s1 += std::to_string(length[i]); + std::string s3("Maximum Along Dimension - "); + s3 += std::to_string(length[i]); + + output += helper.runOperationSuit(&rbSum, 
(const std::function)(generator), batch, s1.c_str()); + output += helper.runOperationSuit(&rbMax, (const std::function)(generator), batch, s3.c_str()); + + + + auto generator3 = PARAMETRIC_D(){ + auto ctx = new Context(1); + int rows = p.getIntParam("rows"); + int cols = length[i] / rows; + int dim = p.getIntParam("dim"); + auto arr = NDArrayFactory::create_('c', {rows, cols}); + + auto dimArg = new Nd4jLong[1]; + dimArg[0] = dim; + ctx->setIArguments(dimArg, 1); + delete[] dimArg; + + ctx->setInputArray(0, arr, true); + + NDArray* result; + if(dim == 0){ + result = NDArrayFactory::create_('c', {cols}); + } else { + result = NDArrayFactory::create_('c', {rows}); + } + ctx->setOutputArray(0, result, true); + return ctx; + }; + + std::string s5("Argmax Along Dimension - "); + s5 += std::to_string(length[i]); + + nd4j::ops::argmax opArgmax; + DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); + output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); + } + return output; + } + + template + static std::string conv2d(){ + std::string output; + output += "conv2d " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + //Convolution2D op + BoolParameters nhwc("nhwc"); + PredefinedParameters k("k", {2, 3}); + + ParametersBatch batch({&nhwc, &k}); + nd4j::ops::conv2d conv2d; + DeclarableBenchmark benchmark(conv2d, "conv2d"); + + int hw = 64; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {8, 3, hw, hw}); + auto output = NDArrayFactory::create_('c', {8, 3, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {8, hw, hw, 3}); + auto output = NDArrayFactory::create_('c', {8, hw, hw, 3}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto b = NDArrayFactory::create_('c', {3}); + auto w = NDArrayFactory::create_('c', {khw, khw, 3, 3}); // [kH, kW, iC, oC] always + + ctx->setInputArray(1, w, true); + ctx->setInputArray(2, b, true); + + auto args = new Nd4jLong[10]; + args[0] = args[1] = khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 10); + delete[] args; + + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d"); + return output; + } + + template + static std::string pool2d() { + std::string output; + output += "pool2d " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + //Convolution2D op + BoolParameters nhwc("nhwc"); + PredefinedParameters k("k", {2, 3}); + + ParametersBatch batch({&nhwc, &k}); + + int c = 3; + int hw = 64; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int n = p.getIntParam("nhwc"); + int khw = p.getIntParam("k"); + + if (n == 0) { + auto input = NDArrayFactory::create_('c', {8, c, hw, hw}); + auto output = NDArrayFactory::create_('c', {8, c, hw, hw}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } else { + auto input = NDArrayFactory::create_('c', {8, hw, hw, c}); + auto output = NDArrayFactory::create_('c', {8, hw, hw, c}); + ctx->setInputArray(0, input, true); + ctx->setOutputArray(0, output, true); + } + + auto args = new Nd4jLong[11]; + args[0] = args[1] = 
khw; //Kernel + args[2] = args[3] = 1;//Stride + args[4] = args[5] = 0; //Pad + args[6] = args[7] = 1; //Dilation + args[8] = 1; //SAME + args[9] = 0; //Divisor mode - 0 = exclude padding in divisor + args[10] = n;//0-nchw, 1=nhwc + ctx->setIArguments(args, 11); + delete[] args; + + return ctx; + }; + + nd4j::ops::avgpool2d avgpool2d; + DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); + output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pool 2d"); + + nd4j::ops::maxpool2d maxpool2d; + DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); + output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pool 2d"); + return output; + } + + template + static std::string lstmBenchmark() { + std::string output; + output += "lstm " + DataTypeUtils::asString(DataTypeUtils::fromT()); + BenchmarkHelper helper(WARMUP, NUM_ITER); + + BoolParameters format("format"); //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen] + PredefinedParameters mb("mb", {1, 8}); + int n = 128; + + ParametersBatch batch({&format, &mb}); + nd4j::ops::lstmBlock lstmBlock; + DeclarableBenchmark benchmark(lstmBlock, "lstm"); + + int seqLength = 8; + + auto generator = PARAMETRIC_D() { + auto ctx = new Context(1); + int f = p.getIntParam("format"); + int m = p.getIntParam("mb"); + + Nd4jLong l = 0; + ctx->setInputArray(0, NDArrayFactory::create_(l), true); //Max TS length (unused) + + + if (f == 0) { + //TNS format + ctx->setInputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('c', {seqLength, m, n}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('c', {seqLength, m, n}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('c', {seqLength, m, n}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('c', {seqLength, m, n}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('c', {seqLength, m, n}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('c', {seqLength, m, n}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('c', {seqLength, m, n}), true); //y + } else { + //NST format + ctx->setInputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //x + ctx->setOutputArray(0, NDArrayFactory::create_('f', {m, n, seqLength}), true); //i + ctx->setOutputArray(1, NDArrayFactory::create_('f', {m, n, seqLength}), true); //c + ctx->setOutputArray(2, NDArrayFactory::create_('f', {m, n, seqLength}), true); //f + ctx->setOutputArray(3, NDArrayFactory::create_('f', {m, n, seqLength}), true); //o + ctx->setOutputArray(4, NDArrayFactory::create_('f', {m, n, seqLength}), true); //z + ctx->setOutputArray(5, NDArrayFactory::create_('f', {m, n, seqLength}), true); //h + ctx->setOutputArray(6, NDArrayFactory::create_('f', {m, n, seqLength}), true); //y + } + + auto cLast = NDArrayFactory::create_('c', {m, n}); + auto yLast = NDArrayFactory::create_('c', {m, n}); + auto W = NDArrayFactory::create_('c', {2 * n, 4 * n}); + auto Wci = NDArrayFactory::create_('c', {n}); + auto Wcf = NDArrayFactory::create_('c', {n}); + auto Wco = NDArrayFactory::create_('c', {n}); + auto b = NDArrayFactory::create_('c', {4 * n}); + + ctx->setInputArray(2, cLast, true); + ctx->setInputArray(3, yLast, true); + ctx->setInputArray(4, W, true); + ctx->setInputArray(5, Wci, true); + ctx->setInputArray(6, Wcf, true); + ctx->setInputArray(7, Wco, true); + ctx->setInputArray(8, b, true); + + auto iargs = new Nd4jLong[2]; + iargs[0] = 0; //No peephole + iargs[1] = f; + ctx->setIArguments(iargs, 2); + delete[] 
iargs; + + auto targs = new double[2]; + targs[0] = 1.0; //forget bias + targs[1] = 0.0; //cell clipping value + ctx->setTArguments(targs, 2); + delete[] targs; + return ctx; + }; + + output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock"); + return output; + } + + static std::string broadcast2d() { + std::string output; + BenchmarkHelper helper(WARMUP, NUM_ITER); + + int rows = 65536; + IntPowerParameters cols("cols", 2, 2, 12, 4); //2^2 to 2^12 in steps of 2 - 2^1=2, ..., 2^10=1024 + BoolParameters axis("axis"); + BoolParameters inplace("inplace"); + + ParametersBatch batch({&cols, &axis, &inplace}); + + auto generator = PARAMETRIC_D() { + auto a = p.getIntParam("axis"); + auto arr = NDArrayFactory::create_('c', {rows, p.getIntParam("cols")}); + + auto ctx = new Context(1); + ctx->setInputArray(0, arr, true); + if(a == 0){ + ctx->setInputArray(1, NDArrayFactory::create_('c', {rows, 1}), true); + } else { + ctx->setInputArray(1, NDArrayFactory::create_('c', {1, p.getIntParam("cols")}), true); + } + if (p.getIntParam("inplace") == 1) { + ctx->setOutputArray(0, arr); + ctx->markInplace(true); + } else { + ctx->setOutputArray(0, NDArrayFactory::create_('c', {rows, p.getIntParam("cols")}), true); + } + return ctx; + }; + + std::string s("add"); + nd4j::ops::add op; + DeclarableBenchmark benchmark(op, "add"); + output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d"); + return output; + } + + std::string LightBenchmarkSuit::runSuit() { +#ifdef _RELEASE + std::vector dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); +#else + std::vector dtypes({nd4j::DataType::FLOAT32}); +#endif + + std::string result; + + for (auto t:dtypes) { + nd4j_printf("Running LightBenchmarkSuite.transformBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += transformBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.scalarBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += scalarBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.reduceDimBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += reduceDimBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.gemmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += gemmBenchmark, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.conv2d [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += conv2d, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.pool2d [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += pool2d, (), LIBND4J_TYPES); + + nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); + BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES); + } + + nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", ""); + result += broadcast2d(); + nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", ""); + result += mismatchedOrderAssign(); + + return result; + } 
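+ + // Caller-side sketch (illustrative only, not part of this file): both suites are normally reached through the NativeOps entry points added in this patch, and the returned heap buffer must be released with deleteCharArray: + // + // NativeOps ops; + // const char* report = ops.runLightBenchmarkSuit(false); + // std::string owned(report); // copy out before freeing the native buffer + // ops.deleteCharArray(reinterpret_cast<Nd4jPointer>(const_cast<char*>(report)));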
+} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 08108d69c..95601ce4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -311,3 +311,27 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_1) { delete result; } + +TEST_F(DeclarableOpsTests15, test_lstmBlock_2) { + int seqLength = 32; + int m = 64; + int n = 32; + + auto x0 = NDArrayFactory::create(5); + auto x1 = NDArrayFactory::create('f', {m, n, seqLength}); + auto x2 = NDArrayFactory::create('f', {m, n}); + auto x3 = NDArrayFactory::create('f', {m, n}); + auto x4 = NDArrayFactory::create('f', {2 * n, 4 * n}); + auto x5 = NDArrayFactory::create('f', {n}); + auto x6 = NDArrayFactory::create('f', {n}); + auto x7 = NDArrayFactory::create('f', {n}); + auto x8 = NDArrayFactory::create('f', {4 * n}); + + nd4j::ops::lstmBlock op; + auto result = op.execute({&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8}, {1.0, 0.0}, {0, 1}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + delete result; +} diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 67a2585e0..c94758c5a 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -38,6 +38,8 @@ #include #include #include +#include +#include using namespace nd4j; using namespace nd4j::graph; @@ -164,6 +166,12 @@ TEST_F(PlaygroundTests, BroadcastOps2d) { } */ +TEST_F(PlaygroundTests, test_benchmark_suit_1) { + //LightBenchmarkSuit suit; + //auto output = suit.runSuit(); + //nd4j_printf("SUIT OUTPUT\n%s\n", output.data()); +} + TEST_F(PlaygroundTests, test_small_reductions) { auto f = NDArrayFactory::create('c', {1024 ,1024}); f.assign(1.0f); diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 38e3b9523..1ac373676 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -193,6 +193,7 @@ if ("${OPENBLAS}" OR CMAKE_BUILD_TYPE STREQUAL "Release") endif() endif() +file(GLOB_RECURSE PERF_SOURCES false ../../include/performance/*.cpp ../../include/performance/*.h) file(GLOB_RECURSE EXCEPTIONS_SOURCES false ../../include/exceptions/*.cpp ../../include/exceptions/*.h) file(GLOB_RECURSE EXEC_SOURCES false ../../include/execution/*.cpp ../../include/execution/*.h) file(GLOB_RECURSE TYPES_SOURCES false ../../include/types/*.cpp ../../include/types/*.h) @@ -234,7 +235,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} - ${OPS_SOURCES} ${TEST_SOURCES}) + ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES}) diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java index 2f2f2478c..330fb1c31 100644 --- 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java @@ -917,4 +917,14 @@ public class DefaultOpExecutioner implements OpExecutioner { public DataBuffer createConstantBuffer(double[] values, DataType desiredType) { throw new UnsupportedOperationException(); } + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + throw new UnsupportedOperationException(); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + throw new UnsupportedOperationException(); + } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java index c4b39d653..1be417644 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/OpExecutioner.java @@ -463,4 +463,8 @@ public interface OpExecutioner { DataBuffer createConstantBuffer(int[] values, DataType desiredType); DataBuffer createConstantBuffer(float[] values, DataType desiredType); DataBuffer createConstantBuffer(double[] values, DataType desiredType); + + + String runLightBenchmarkSuit(boolean printOut); + String runFullBenchmarkSuit(boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java index 1ac932584..c6413d411 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java @@ -1386,15 +1386,39 @@ public class Nd4j { */ public static DataBuffer createBufferDetached(int[] shape, DataType type) { long length = ArrayUtil.prodLong(shape); - if (type == DataType.INT) - return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); - if (type == DataType.LONG) - return DATA_BUFFER_FACTORY_INSTANCE.createLong(new long[]{length}); - else if (type == DataType.HALF) - return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); - - return type == DataType.DOUBLE ? 
DATA_BUFFER_FACTORY_INSTANCE.createDouble(length) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); - + switch (type){ + case DOUBLE: + return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); + case FLOAT: + return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); + case HALF: + return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); + case BFLOAT16: + return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length); + case UINT64: + return DATA_BUFFER_FACTORY_INSTANCE.createULong(length); + case LONG: + return DATA_BUFFER_FACTORY_INSTANCE.createLong(length); + case UINT32: + return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length); + case INT: + return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); + case UINT16: + return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length); + case SHORT: + return DATA_BUFFER_FACTORY_INSTANCE.createShort(length); + case UBYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length); + case BYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createByte(length); + case BOOL: + return DATA_BUFFER_FACTORY_INSTANCE.createBool(length); + case UTF8: + case COMPRESSED: + case UNKNOWN: + default: + throw new UnsupportedOperationException("Cannot create type: " + type); + } } /** @@ -1403,16 +1427,39 @@ public class Nd4j { public static DataBuffer createBuffer(long[] shape, DataType type) { long length = ArrayUtil.prodLong(shape); - if (type == DataType.INT) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.LONG) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.HALF) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.DOUBLE) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + switch (type) { + case BOOL: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBool(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UBYTE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT16: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT32: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? 
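/* Every case in this switch follows the same pattern: allocate normally when no
   workspace is open, otherwise allocate from the caller's current memory workspace. */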
DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createUInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UINT64: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createULong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case BYTE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createByte(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case SHORT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createShort(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case INT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case LONG: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case HALF: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case BFLOAT16: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case FLOAT: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case DOUBLE: + return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? 
DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + case UTF8: + case COMPRESSED: + case UNKNOWN: + default: + throw new UnsupportedOperationException("Cannot create type: " + type); + } } @@ -1424,19 +1471,31 @@ public class Nd4j { switch (type){ case DOUBLE: - DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); + return DATA_BUFFER_FACTORY_INSTANCE.createDouble(length); case FLOAT: - DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); + return DATA_BUFFER_FACTORY_INSTANCE.createFloat(length); case HALF: return DATA_BUFFER_FACTORY_INSTANCE.createHalf(length); + case BFLOAT16: + return DATA_BUFFER_FACTORY_INSTANCE.createBFloat16(length); + case UINT64: + return DATA_BUFFER_FACTORY_INSTANCE.createULong(length); case LONG: return DATA_BUFFER_FACTORY_INSTANCE.createLong(length); + case UINT32: + return DATA_BUFFER_FACTORY_INSTANCE.createUInt(length); case INT: return DATA_BUFFER_FACTORY_INSTANCE.createInt(length); + case UINT16: + return DATA_BUFFER_FACTORY_INSTANCE.createUShort(length); case SHORT: + return DATA_BUFFER_FACTORY_INSTANCE.createShort(length); case UBYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createUByte(length); case BYTE: + return DATA_BUFFER_FACTORY_INSTANCE.createByte(length); case BOOL: + return DATA_BUFFER_FACTORY_INSTANCE.createBool(length); case UTF8: case COMPRESSED: case UNKNOWN: diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index 278a4e39f..e5990f981 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1161,4 +1161,10 @@ public abstract class NativeOps extends Pointer { public abstract Pointer constantBuffer(int dtype, DoublePointer data, int length); public abstract Pointer constantBuffer(int dtype, @Cast("Nd4jLong *") LongPointer data, int length); + + public abstract String runLightBenchmarkSuit(boolean printOut); + + public abstract String runFullBenchmarkSuit(boolean printOut); + + public abstract long getCachedMemory(int deviceId); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java index dcb644468..4b4bff588 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/buffer/factory/CudaDataBufferFactory.java @@ -418,6 +418,126 @@ public class CudaDataBufferFactory implements DataBufferFactory { return new CudaIntDataBuffer(length); } + @Override + public DataBuffer createBFloat16(long length) { + return new CudaBfloat16DataBuffer(length); + } + + @Override + public DataBuffer createUInt(long length) { + return new CudaUInt32DataBuffer(length); + } + + @Override + public DataBuffer createUShort(long length) { + return new CudaUInt16DataBuffer(length); + } + + @Override + public DataBuffer createUByte(long length) { + return new CudaUByteDataBuffer(length); + } + + @Override + public DataBuffer createULong(long length) { + return new 
CudaUInt64DataBuffer(length); + } + + @Override + public DataBuffer createBool(long length) { + return new CudaBoolDataBuffer(length); + } + + @Override + public DataBuffer createShort(long length) { + return new CudaShortDataBuffer(length); + } + + @Override + public DataBuffer createByte(long length) { + return new CudaByteDataBuffer(length); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize) { + return new CudaBfloat16DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize) { + return new CudaUInt32DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize) { + return new CudaUInt16DataBuffer(length, initialize); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize) { + return new CudaUByteDataBuffer(length, initialize); + } + + @Override + public DataBuffer createULong(long length, boolean initialize) { + return new CudaUInt64DataBuffer(length, initialize); + } + + @Override + public DataBuffer createBool(long length, boolean initialize) { + return new CudaBoolDataBuffer(length, initialize); + } + + @Override + public DataBuffer createShort(long length, boolean initialize) { + return new CudaShortDataBuffer(length, initialize); + } + + @Override + public DataBuffer createByte(long length, boolean initialize) { + return new CudaByteDataBuffer(length, initialize); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaBfloat16DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt32DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt16DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUByteDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaUInt64DataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaBoolDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaShortDataBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new CudaByteDataBuffer(length, initialize, workspace); + } + @Override public DataBuffer createInt(long length, boolean initialize) { return new CudaIntDataBuffer(length, initialize); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java index bcfa7b22d..3145fd8c2 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java +++ 
b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java @@ -2757,6 +2757,16 @@ public class CudaExecutioner extends DefaultOpExecutioner { return buffer; } + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + return nativeOps.runLightBenchmarkSuit(printOut); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + return nativeOps.runFullBenchmarkSuit(printOut); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index 3f7794074..4f11acb7c 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { */ public native int getDeviceMajor(int deviceId); + /** + * This method returns amount of cached memory + * @param deviceId + * @return + */ + public native @Cast("Nd4jLong") long getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId); + public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer); public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer); @@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); + + + public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut); + public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java index dd44914be..238209e88 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java @@ -2185,4 +2185,15 @@ public class NativeOpExecutioner extends DefaultOpExecutioner { sb.append(". 
Output var names: ").append(Arrays.toString(outNames)); } } + + + @Override + public String runLightBenchmarkSuit(boolean printOut) { + return loop.runLightBenchmarkSuit(printOut); + } + + @Override + public String runFullBenchmarkSuit(boolean printOut) { + return loop.runFullBenchmarkSuit(printOut); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index fcde146d3..5cc1a46a0 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -1977,6 +1977,13 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { */ public native int getDeviceMajor(int deviceId); + /** + * This method returns amount of cached memory + * @param deviceId + * @return + */ + public native @Cast("Nd4jLong") long getCachedMemory(int deviceId); + /** * * @param ptrToDeviceId @@ -2976,6 +2983,7 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native int unregisterGraph(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong") long graphId); + public native void deleteCharArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteIntArray(@Cast("Nd4jPointer") Pointer pointer); public native void deleteLongArray(@Cast("Nd4jPointer") Pointer pointer); public native void deletePointerArray(@Cast("Nd4jPointer") Pointer pointer); @@ -3038,6 +3046,10 @@ public static class NativeOps extends org.nd4j.nativeblas.NativeOps { public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, double[] data, int length); public native ConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); + + + public native @Cast("char*") String runLightBenchmarkSuit(@Cast("bool") boolean printOut); + public native @Cast("char*") String runFullBenchmarkSuit(@Cast("bool") boolean printOut); } diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java index 5688aa611..1915722b4 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java @@ -3306,6 +3306,28 @@ public class Nd4jTestsC extends BaseNd4jTest { log.info("arrayf data: {}", Arrays.toString(arrayf.data().asFloat())); } + @Test + public void testCreateDetached_1() { + val shape = new int[]{10}; + val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF}; + + for(DataType dt : dataTypes){ + val dataBuffer = Nd4j.createBufferDetached(shape, dt); + assertEquals(dt, dataBuffer.dataType()); + } + } + + @Test + public void testCreateDetached_2() { + val shape = new long[]{10}; + val dataTypes = new DataType[] {DataType.DOUBLE, DataType.BOOL, DataType.BYTE, DataType.UBYTE, DataType.SHORT, DataType.UINT16, DataType.INT, DataType.UINT32, DataType.LONG, DataType.UINT64, DataType.FLOAT, DataType.BFLOAT16, DataType.HALF}; + + for(DataType dt : 
dataTypes){ + val dataBuffer = Nd4j.createBufferDetached(shape, dt); + assertEquals(dt, dataBuffer.dataType()); + } + } + @Test public void testPairwiseMixedC() { int[] shape2 = {12, 8}; @@ -7889,6 +7911,7 @@ public class Nd4jTestsC extends BaseNd4jTest { assertEquals(Nd4j.createFromArray(1f, 3f, 4f), out); } + private static INDArray fwd(INDArray input, INDArray W, INDArray b){ INDArray ret = Nd4j.createUninitialized(input.size(0), W.size(1)); input.mmuli(W, ret); diff --git a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java index 1a2ec6f37..743f34655 100644 --- a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java +++ b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DataBufferFactory.java @@ -355,6 +355,7 @@ public interface DataBufferFactory { DataBuffer create(DataType dataType, long length, boolean initialize, MemoryWorkspace workspace); + /** * Create an int data buffer * @@ -363,6 +364,33 @@ public interface DataBufferFactory { */ DataBuffer createInt(long length); + DataBuffer createBFloat16(long length); + DataBuffer createByte(long length); + DataBuffer createShort(long length); + DataBuffer createBool(long length); + DataBuffer createUShort(long length); + DataBuffer createUInt(long length); + DataBuffer createUByte(long length); + DataBuffer createULong(long length); + + DataBuffer createBFloat16(long length, boolean initialize); + DataBuffer createByte(long length, boolean initialize); + DataBuffer createShort(long length, boolean initialize); + DataBuffer createBool(long length, boolean initialize); + DataBuffer createUShort(long length, boolean initialize); + DataBuffer createUInt(long length, boolean initialize); + DataBuffer createUByte(long length, boolean initialize); + DataBuffer createULong(long length, boolean initialize); + + DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace); + DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace); + /** * Create an int data buffer, with optional initialization * diff --git a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java index 2bb49716e..96b154338 100644 --- a/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java +++ b/nd4j/nd4j-buffer/src/main/java/org/nd4j/linalg/api/buffer/factory/DefaultDataBufferFactory.java @@ -354,11 +354,132 @@ public class DefaultDataBufferFactory implements DataBufferFactory { return new IntBuffer(length); } + @Override + public DataBuffer createBFloat16(long length) { + return new BFloat16Buffer(length); + } + + @Override + public DataBuffer createUInt(long length) { + return new UInt32Buffer(length); + } + + @Override + public DataBuffer createUShort(long length) { + 
return new UInt16Buffer(length); + } + + @Override + public DataBuffer createUByte(long length) { + return new UInt8Buffer(length); + } + + @Override + public DataBuffer createULong(long length) { + return new UInt64Buffer(length); + } + + @Override + public DataBuffer createBool(long length) { + return new BoolBuffer(length); + } + + @Override + public DataBuffer createShort(long length) { + return new Int16Buffer(length); + } + + @Override + public DataBuffer createByte(long length) { + return new Int8Buffer(length); + } + + @Override + public DataBuffer createBFloat16(long length, boolean initialize) { + return new BFloat16Buffer(length, initialize); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize) { + return new UInt32Buffer(length, initialize); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize) { + return new UInt16Buffer(length, initialize); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize) { + return new UInt8Buffer(length, initialize); + } + + @Override + public DataBuffer createULong(long length, boolean initialize) { + return new UInt64Buffer(length, initialize); + } + + @Override + public DataBuffer createBool(long length, boolean initialize) { + return new BoolBuffer(length, initialize); + } + + @Override + public DataBuffer createShort(long length, boolean initialize) { + return new Int16Buffer(length, initialize); + } + + @Override + public DataBuffer createByte(long length, boolean initialize) { + return new Int8Buffer(length, initialize); + } + @Override public DataBuffer createInt(long length, boolean initialize) { return new IntBuffer(length, initialize); } + @Override + public DataBuffer createBFloat16(long length, boolean initialize, MemoryWorkspace workspace) { + return new BFloat16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUInt(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt32Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createUByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt8Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createULong(long length, boolean initialize, MemoryWorkspace workspace) { + return new UInt64Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createBool(long length, boolean initialize, MemoryWorkspace workspace) { + return new BoolBuffer(length, initialize, workspace); + } + + @Override + public DataBuffer createShort(long length, boolean initialize, MemoryWorkspace workspace) { + return new Int16Buffer(length, initialize, workspace); + } + + @Override + public DataBuffer createByte(long length, boolean initialize, MemoryWorkspace workspace) { + return new Int8Buffer(length, initialize, workspace); + } + + @Override public DataBuffer createInt(long length, boolean initialize, MemoryWorkspace workspace) { return new IntBuffer(length, initialize, workspace); @@ -665,12 +786,12 @@ public class DefaultDataBufferFactory implements DataBufferFactory { @Override public DataBuffer createHalf(long length) { - throw new UnsupportedOperationException("FP16 isn't supported for CPU yet"); + return new HalfBuffer(length); } @Override public DataBuffer createHalf(long length, boolean initialize) { - throw 
new UnsupportedOperationException("FP16 isn't supported for CPU yet"); + return new HalfBuffer(length, initialize); } /**
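Usage note: with the factory methods above in place, and createHalf now returning a HalfBuffer instead of throwing, detached buffers can be created uniformly for every fixed-width type. A short sketch in the spirit of testCreateDetached_1 from this patch; the demo class itself is hypothetical:

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.factory.Nd4j;

public class DetachedBufferDemo {
    public static void main(String[] args) {
        int[] shape = {10};
        // HALF used to throw on CPU ("FP16 isn't supported for CPU yet");
        // BFLOAT16 and the unsigned integer types are newly reachable here.
        for (DataType dt : new DataType[]{DataType.HALF, DataType.BFLOAT16, DataType.UINT64}) {
            DataBuffer buffer = Nd4j.createBufferDetached(shape, dt);
            System.out.println(dt + " -> " + buffer.dataType());
        }
    }
}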