From 7783012f395ddb31459ef5726cb9b03ba7f52cad Mon Sep 17 00:00:00 2001 From: raver119 Date: Mon, 20 Jan 2020 21:32:46 +0300 Subject: [PATCH] cuDNN integration (#150) * initial commit Signed-off-by: raver119 * one file Signed-off-by: raver119 * few more includes Signed-off-by: raver119 * m? Signed-off-by: raver119 * const Signed-off-by: raver119 * cudnn linkage in tests Signed-off-by: raver119 * culibos Signed-off-by: raver119 * static reminder Signed-off-by: raver119 * platform engine tag Signed-off-by: raver119 * HAVE_CUDNN moved to config.h.in Signed-off-by: raver119 * include Signed-off-by: raver119 * include Signed-off-by: raver119 * skip cudnn handle creation if there's not cudnn Signed-off-by: raver119 * meh Signed-off-by: raver119 * target device in context Signed-off-by: raver119 * platform engines Signed-off-by: raver119 * platform engines Signed-off-by: raver119 * allow multiple -h args Signed-off-by: raver119 * allow multiple -h args Signed-off-by: raver119 * move mkldnn out of CPU block Signed-off-by: raver119 * link to mkldnn on cuda Signed-off-by: raver119 * less prints Signed-off-by: raver119 * minor tweaks Signed-off-by: raver119 * next step Signed-off-by: raver119 * conv2d NCHW draft Signed-off-by: raver119 * conv2d biasAdd Signed-off-by: raver119 * test for MKL/CUDNN combined use Signed-off-by: raver119 * - provide additional code for conv2d ff based on cudnn api, not tested yet Signed-off-by: Yurii * - further work on conv2d helper based on using cudnn api Signed-off-by: Yurii * - fixing several cuda bugs which appeared after cudnn lib had been started to use Signed-off-by: Yurii * - implementation of conv2d backprop op based on cudnn api Signed-off-by: Yurii * - implementaion of conv3d and conv3d_bp ops based on cudnn api Signed-off-by: Yurii * - bugs fixing in conv3d/conv3d_bp ops (cudnn in use) Signed-off-by: Yurii * - implementation of depthwiseConv2d (ff/bp) op based on cudnn api Signed-off-by: Yurii * - implementation of batchnorm ff op based on cudnn api Signed-off-by: Yurii * - disable cudnn batchnorm temporary Signed-off-by: Yurii * - add minor change in cmake Signed-off-by: Yurii * engine for depthwise mkldnn Signed-off-by: raver119 * couple of includes Signed-off-by: raver119 * - provide permutation to cudnn batchnorm ff when format is NHWC Signed-off-by: Yurii * lgamma fix Signed-off-by: raver119 * - eliminate memory leak in two tests Signed-off-by: Yurii Co-authored-by: Yurii Shyrma --- libnd4j/CMakeLists.txt | 93 +++- libnd4j/blas/CMakeLists.txt | 26 +- libnd4j/blas/NDArray.h | 1 + libnd4j/blas/NDArray.hpp | 12 +- libnd4j/blas/cuda/NativeOpExecutioner.cu | 28 +- libnd4j/buildnativeoperations.sh | 13 +- libnd4j/include/config.h.in | 4 + libnd4j/include/execution/Engine.h | 31 ++ libnd4j/include/execution/Executor.h | 6 +- libnd4j/include/execution/LaunchContext.h | 2 + .../include/execution/cuda/LaunchContext.cu | 4 + libnd4j/include/graph/Context.h | 4 +- libnd4j/include/graph/ContextPrototype.h | 9 + libnd4j/include/graph/impl/Context.cpp | 4 + .../include/graph/impl/ContextPrototype.cpp | 4 + libnd4j/include/helpers/cublasHelper.h | 2 + .../include/helpers/cuda_off/cublasHelper.cu | 29 + libnd4j/include/loops/cuda/indexreduce.cu | 51 +- libnd4j/include/memory/cuda/Workspace.cu | 2 +- .../include/ops/declarable/OpRegistrator.h | 9 +- .../include/ops/declarable/PlatformHelper.h | 8 +- .../declarable/generic/nn/convo/conv3d.cpp | 14 +- .../ops/declarable/helpers/convolutions.h | 12 +- .../declarable/helpers/cuda/image_resize.cu | 6 +- 
.../ops/declarable/helpers/cuda/lgamma.cu | 2 +- .../ops/declarable/impl/DeclarableOp.cpp | 4 +- .../ops/declarable/impl/LegacyScalarOp.cpp | 6 +- .../ops/declarable/impl/OpRegistrator.cpp | 21 +- .../ops/declarable/impl/PlatformHelper.cpp | 7 +- .../declarable/platform/cudnn/batchnorm.cu | 275 +++++++++ .../ops/declarable/platform/cudnn/conv2d.cu | 521 ++++++++++++++++++ .../ops/declarable/platform/cudnn/conv3d.cu | 453 +++++++++++++++ .../declarable/platform/cudnn/cudnnUtils.h | 158 ++++++ .../platform/cudnn/depthwiseConv2d.cu | 443 +++++++++++++++ .../platform/mkldnn/avgpooling2d.cpp | 5 +- .../platform/mkldnn/avgpooling2d_bp.cpp | 4 +- .../platform/mkldnn/avgpooling3d.cpp | 4 +- .../platform/mkldnn/avgpooling3d_bp.cpp | 4 +- .../declarable/platform/mkldnn/batchnorm.cpp | 8 +- .../ops/declarable/platform/mkldnn/conv2d.cpp | 8 +- .../ops/declarable/platform/mkldnn/conv3d.cpp | 8 +- .../declarable/platform/mkldnn/deconv2d.cpp | 8 +- .../platform/mkldnn/deconv2d_tf.cpp | 4 +- .../declarable/platform/mkldnn/deconv3d.cpp | 8 +- .../platform/mkldnn/depthwiseConv2d.cpp | 8 +- .../ops/declarable/platform/mkldnn/lrn.cpp | 4 +- .../declarable/platform/mkldnn/lstmLayer.cpp | 4 +- .../platform/mkldnn/maxpooling2d.cpp | 4 +- .../platform/mkldnn/maxpooling2d_bp.cpp | 4 +- .../platform/mkldnn/maxpooling3d.cpp | 4 +- .../platform/mkldnn/maxpooling_3d_bp.cpp | 4 +- .../declarable/platform/mkldnn/mkldnnUtils.h | 49 +- libnd4j/include/platform_boilerplate.h | 46 +- libnd4j/include/play.h | 8 +- libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 10 +- .../layers_tests/ConvolutionTests1.cpp | 109 +++- .../layers_tests/ConvolutionTests2.cpp | 2 +- libnd4j/tests_cpu/layers_tests/CuDnnTests.cu | 128 +++++ .../layers_tests/DeclarableOpsTests10.cpp | 203 ------- .../layers_tests/DeclarableOpsTests13.cpp | 299 ++++++++++ .../tests_cpu/layers_tests/MklDnnTests.cpp | 28 +- .../layers_tests/NDArrayCudaBasicsTests.cu | 2 + .../tests_cpu/libnd4j_tests/CMakeLists.txt | 2 +- .../org/nd4j/nativeblas/Nd4jCudaPresets.java | 1 + .../org/nd4j/nativeblas/Nd4jCpuPresets.java | 1 + 65 files changed, 2810 insertions(+), 435 deletions(-) create mode 100644 libnd4j/include/execution/Engine.h create mode 100644 libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu create mode 100644 libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu create mode 100644 libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu create mode 100644 libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h create mode 100644 libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu create mode 100644 libnd4j/tests_cpu/layers_tests/CuDnnTests.cu diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index 6ee102cb3..c82b0b217 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -14,6 +14,10 @@ set(CMAKE_CXX_STANDARD 11) if (CUDA_BLAS) enable_language(CUDA) set(CMAKE_CUDA_STANDARD 11) + + set(DEFAULT_ENGINE "samediff::ENGINE_CUDA") +else() + set(DEFAULT_ENGINE "samediff::ENGINE_CPU") endif() # MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively @@ -60,6 +64,7 @@ if(NATIVE) ENDIF() endif() + if(NOT CUDA_BLAS) # we need this definition to avoid global memory use within mkldnn add_definitions(-DDNNL_ENABLE_CONCURRENT_EXEC=true) @@ -128,36 +133,70 @@ if(NOT CUDA_BLAS) include_directories(${CPUF_SOURCE_DIR}/include) set(CPU_FEATURES cpu_features) endif() +endif() - # new mkl-dnn entry - if (${HELPERS_mkldnn}) - message("Going to pull & build mkldnn") - set(HAVE_MKLDNN 1) - 
set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE) - configure_file(./CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) - if(result) - message(FATAL_ERROR "CMake step for mkldnn failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) - if(result) - message(FATAL_ERROR "Build step for mkldnn failed: ${result}") - endif() +# new mkl-dnn entry +if (${HELPERS_mkldnn}) + message("Going to pull & build mkldnn") + set(HAVE_MKLDNN 1) + set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE) - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src - ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build - EXCLUDE_FROM_ALL) + configure_file(./CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) + if(result) + message(FATAL_ERROR "CMake step for mkldnn failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) + if(result) + message(FATAL_ERROR "Build step for mkldnn failed: ${result}") + endif() - set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build) - set(mkldnn_EXT_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src) - set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}") - include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR}) - set(MKLDNN dnnl) + add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src + ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build + EXCLUDE_FROM_ALL) + + set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build) + set(mkldnn_EXT_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src) + set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}") + include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR}) + set(MKLDNN dnnl) +endif() + + +if (${HELPERS_cudnn}) + if (NOT CUDA_BLAS) + message(FATAL_ERROR "Can't build cuDNN on non-CUDA platform") + endif() + + set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN") + + # FIXME: we don't want static library in master + SET(CUDNN_LIBNAME "cudnn") + SET(CULIBOS_LIBNAME "culibos") + + find_path(CUDNN_INCLUDE_DIR cudnn.h + HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES cuda/include include) + + find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME} + HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + + find_library(CULIBOS_LIBRARY ${CULIBOS_LIBNAME} + HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + + + if (CUDNN_LIBRARY) + set(HAVE_CUDNN true) + set(CUDNN ${CUDNN_LIBRARY} ${CULIBOS_LIBRARY}) + else() + message(FATAL_ERROR "Unable to find cuDNN") endif() endif() @@ -185,6 +224,8 @@ set(HAVE_FLATBUFFERS 1) set(FLATBUFFERS_PATH ${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src) include_directories(${FLATBUFFERS_PATH}/include) + + configure_file(include/config.h.in include/config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index d07ec57bc..c1c5de399 100755 --- a/libnd4j/blas/CMakeLists.txt +++ 
b/libnd4j/blas/CMakeLists.txt @@ -131,6 +131,11 @@ if(!CUDA_BLAS) endif() endif() +#if MKLDNN is enabled - we're building mkldnn-powered helpers +if (HAVE_MKLDNN) + file(GLOB_RECURSE CUSTOMOPS_MKLDNN_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h) +endif() + if(CUDA_BLAS) message("Build cublas") find_package(CUDA) @@ -210,16 +215,23 @@ if(CUDA_BLAS) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h) file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu) + if (HAVE_CUDNN) + message("cuDNN included") + file(GLOB_RECURSE CUSTOMOPS_CUDNN_SOURCES false ../include/ops/declarable/platform/cudnn/*.cu) + endif() + add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA} ${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES}) add_library(${LIBND4J_NAME} SHARED $) - message("MSVC runtime for library: ${MSVC_RT_LIB}") + if (WIN32) + message("MSVC runtime for library: ${MSVC_RT_LIB}") + endif() # static library is built only if we're going to build tests, skip otherwise if (BUILD_TESTS) @@ -237,7 +249,7 @@ if(CUDA_BLAS) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") endif() - target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY}) + target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda) install(TARGETS ${LIBND4J_NAME} DESTINATION .) 
@@ -264,12 +276,6 @@ elseif(CPU_BLAS) file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/*.cpp ../include/helpers/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/*.cpp ../include/loops/*.h) - - #if MKLDNN is enabled - we're building mkldnn-powered helpers - if (HAVE_MKLDNN) - file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h) - endif() - if (X86_BUILD) # we disable platform optimizations for certains files for linux/macos set_source_files_properties(cpu/NativeOps.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") @@ -282,7 +288,7 @@ elseif(CPU_BLAS) cpu/NativeOpExecutioner.cpp cpu/NDArray.cpp cpu/NDArrayFactory.cpp ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) if(IOS) add_library(${LIBND4J_NAME} STATIC $) diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index a426e471e..037e6e1d0 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -940,6 +940,7 @@ namespace nd4j { template std::vector getBufferAsVector(); std::vector getShapeAsVector() const; + std::vector getShapeAsVectorInt() const; std::vector getShapeInfoAsVector(); std::vector getShapeInfoAsFlatVector(); std::vector getShapeAsFlatVector(); diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index b79f52fb3..42b29cf78 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -444,6 +444,16 @@ std::vector NDArray::getShapeAsVector() const { return vector; } +//////////////////////////////////////////////////////////////////////// +std::vector NDArray::getShapeAsVectorInt() const { + + std::vector vector(this->rankOf()); + for (int e = 0; e < this->rankOf(); e++) + vector[e] = static_cast(this->sizeAt(e)); + + return vector; +} + //////////////////////////////////////////////////////////////////////// std::vector NDArray::getShapeInfoAsFlatVector() { int magicNumber = shape::shapeInfoLength(this->rankOf()); @@ -625,7 +635,7 @@ void NDArray::assign(const NDArray& other, bool allowParallelism) { if (other.lengthOf() != lengthOf()) { auto shapeThis = ShapeUtils::shapeAsString(this); auto shapeThat = ShapeUtils::shapeAsString(&other); - nd4j_printf("Can't assign new value to the array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str()); + nd4j_printf("Can't assign array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str()); throw std::runtime_error("NDArray::assign: lengths of arrays are mismatched"); } diff --git a/libnd4j/blas/cuda/NativeOpExecutioner.cu b/libnd4j/blas/cuda/NativeOpExecutioner.cu index 1f074f39b..1e0685dc4 100644 --- a/libnd4j/blas/cuda/NativeOpExecutioner.cu +++ b/libnd4j/blas/cuda/NativeOpExecutioner.cu @@ -488,7 +488,7 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", xType, zType); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 8192); + dim3 
launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 8192); BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES); @@ -523,7 +523,7 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, auto xRank = shape::rank(hXShapeInfo); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES); @@ -559,7 +559,7 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, auto xRank = shape::rank(hXShapeInfo); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); @@ -601,7 +601,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); if (zType != nd4j::DataType::INT64 && zType != nd4j::DataType::INT32) throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType); @@ -647,7 +647,7 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, auto xRank = shape::rank(hXShapeInfo); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); @@ -684,7 +684,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); if (nd4j::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1) printf("AF1 opNum:[%i]\n", opNum); @@ -734,7 +734,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 
1 : numBlocks, blockWidth, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES); @@ -766,7 +766,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, BOOL_TYPES); @@ -797,7 +797,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES); @@ -828,7 +828,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, LONG_TYPES); @@ -1085,7 +1085,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); if (xType != yType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType); @@ -1135,7 +1135,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, @@ -1177,7 +1177,7 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); - dim3 launchDims(numBlocks, blockWidth, 32768); + dim3 launchDims(numBlocks == 0 ? 
1 : numBlocks, blockWidth, 32768); if (xType != yType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType); @@ -1595,7 +1595,7 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType); auto numBlocks = shape::length(hZShapeInfo); - dim3 launchDims(numBlocks, 256, 32768); + dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index 119b04f93..a8b45e918 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -489,6 +489,7 @@ mkbuilddir() { cd "blasbuild/$CHIP" } +HELPERS="" if [ "$HELPER" == "" ]; then echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" echo "!! !!" @@ -503,6 +504,14 @@ if [ "$HELPER" == "" ]; then echo "!! !!" echo "!! !!" echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" +else + # if helpers were defined, we'll propagate them to CMake + IFS=',' + read -ra HLP <<< "$HELPER" + for i in "${HLP[@]}"; do + HELPERS="${HELPERS} -DHELPERS_$i=true" + done + IFS=' ' fi echo PACKAGING = "${PACKAGING}" @@ -519,10 +528,10 @@ echo MINIFIER = "${MINIFIER_ARG}" echo TESTS = "${TESTS_ARG}" echo NAME = "${NAME_ARG}" echo OPENBLAS_PATH = "$OPENBLAS_PATH" -echo HELPERS = "$HELPER" +echo HELPERS = "$HELPERS" mkbuilddir pwd -eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DHELPERS_"$HELPER"=true "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. +eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi diff --git a/libnd4j/include/config.h.in b/libnd4j/include/config.h.in index bdba3cc03..1e63552d0 100644 --- a/libnd4j/include/config.h.in +++ b/libnd4j/include/config.h.in @@ -13,4 +13,8 @@ #cmakedefine FLATBUFFERS_PATH "@FLATBUFFERS_PATH@" +#cmakedefine HAVE_CUDNN + +#cmakedefine DEFAULT_ENGINE @DEFAULT_ENGINE@ + #endif diff --git a/libnd4j/include/execution/Engine.h b/libnd4j/include/execution/Engine.h new file mode 100644 index 000000000..cd30867a9 --- /dev/null +++ b/libnd4j/include/execution/Engine.h @@ -0,0 +1,31 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_ENGINE_H +#define SD_ENGINE_H + +namespace samediff { + enum Engine { + ENGINE_CPU = 0, + ENGINE_CUDA = 1, + }; +} + +#endif //SD_ENGINE_H diff --git a/libnd4j/include/execution/Executor.h b/libnd4j/include/execution/Executor.h index 8922e345d..26d5365ad 100644 --- a/libnd4j/include/execution/Executor.h +++ b/libnd4j/include/execution/Executor.h @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#ifndef DEV_TESTS_EXECUTOR_H -#define DEV_TESTS_EXECUTOR_H +#ifndef SD_EXECUTOR_H +#define SD_EXECUTOR_H namespace nd4j { class Executor { @@ -30,4 +30,4 @@ namespace nd4j { }; } -#endif //DEV_TESTS_EXECUTOR_H +#endif //SD_EXECUTOR_H diff --git a/libnd4j/include/execution/LaunchContext.h b/libnd4j/include/execution/LaunchContext.h index 076e2933b..689d79369 100644 --- a/libnd4j/include/execution/LaunchContext.h +++ b/libnd4j/include/execution/LaunchContext.h @@ -27,6 +27,7 @@ #include #include #include +#include "config.h" #endif // used for MKLDNN etc @@ -81,6 +82,7 @@ class ND4J_EXPORT LaunchContext { int* getAllocationPointer() const; void* getCublasHandle() const; void* getCusolverHandle() const; + void* getCuDnnHandle() const; cudaStream_t* getCudaStream() const; cudaStream_t* getCudaSpecialStream() const; diff --git a/libnd4j/include/execution/cuda/LaunchContext.cu b/libnd4j/include/execution/cuda/LaunchContext.cu index 5e2ac589c..3145ca8d3 100644 --- a/libnd4j/include/execution/cuda/LaunchContext.cu +++ b/libnd4j/include/execution/cuda/LaunchContext.cu @@ -166,6 +166,10 @@ LaunchContext::LaunchContext() { return contextBuffers.isInitialized(); } + void* LaunchContext::getCuDnnHandle() const { + return CublasHelper::getInstance()->cudnn(); + } + sd::ErrorReference* LaunchContext::errorReference() { return contextBuffers.errorReference(); } diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index 57988da79..f4fa6d16d 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -27,6 +27,7 @@ #include #include #include +#include // CUDA-specific includes #ifdef __CUDACC__ @@ -102,12 +103,13 @@ namespace nd4j { // this method returns workspace for object allocations nd4j::memory::Workspace* oWorkspace(); - void setVariableSpace(VariableSpace* variableSpace); nd4j::random::RandomBuffer* getRNG(); void setRNG(nd4j::random::RandomBuffer* rng); + void setTargetEngine(samediff::Engine engine); + VariableSpace *getVariableSpace(); LaunchContext* launchContext(); diff --git a/libnd4j/include/graph/ContextPrototype.h b/libnd4j/include/graph/ContextPrototype.h index 8aaa3e3d2..a9d05b7b4 100644 --- a/libnd4j/include/graph/ContextPrototype.h +++ b/libnd4j/include/graph/ContextPrototype.h @@ -27,6 +27,11 @@ #include #include #include +#include + +#ifndef __STANDALONE_BUILD__ +#include +#endif namespace nd4j { namespace graph { @@ -53,6 +58,8 @@ namespace nd4j { nd4j::ops::OpDescriptor* _opDescriptor; bool _useMKLDNN = nd4j::Environment::getInstance()->isUseMKLDNN(); + // target engine for execution + samediff::Engine _engine = DEFAULT_ENGINE; public: 
explicit ContextPrototype(nd4j::ops::OpDescriptor* opDescriptor = nullptr, int nodeId = 1, bool inPlace = false); ~ContextPrototype() = default; @@ -84,6 +91,8 @@ namespace nd4j { std::vector* getBArguments(); std::vector* getAxis(); + samediff::Engine engine(); + size_t numT(); size_t numI(); size_t numB(); diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index 2725a2667..4876675dc 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -107,6 +107,10 @@ namespace nd4j { delete _context; } + void Context::setTargetEngine(samediff::Engine engine) { + _engine = engine; + } + bool Context::hasWorkspaceProvided() { return this->_workspace != nullptr; } diff --git a/libnd4j/include/graph/impl/ContextPrototype.cpp b/libnd4j/include/graph/impl/ContextPrototype.cpp index 5bd2a69e7..0ddde97f4 100644 --- a/libnd4j/include/graph/impl/ContextPrototype.cpp +++ b/libnd4j/include/graph/impl/ContextPrototype.cpp @@ -59,6 +59,10 @@ namespace nd4j { } } + samediff::Engine ContextPrototype::engine() { + return _engine; + } + bool ContextPrototype::hasVariablesFilled() { return this->_inputs.size() > 0; } diff --git a/libnd4j/include/helpers/cublasHelper.h b/libnd4j/include/helpers/cublasHelper.h index 53d30abf6..f07cc178c 100644 --- a/libnd4j/include/helpers/cublasHelper.h +++ b/libnd4j/include/helpers/cublasHelper.h @@ -34,12 +34,14 @@ namespace nd4j { std::vector _cache; std::vector _solvers; + std::vector _cudnn; CublasHelper(); ~CublasHelper(); public: static CublasHelper* getInstance(); + void* cudnn(); void* solver(); void* handle(); diff --git a/libnd4j/include/helpers/cuda_off/cublasHelper.cu b/libnd4j/include/helpers/cuda_off/cublasHelper.cu index d9784eaa2..7204862eb 100644 --- a/libnd4j/include/helpers/cuda_off/cublasHelper.cu +++ b/libnd4j/include/helpers/cuda_off/cublasHelper.cu @@ -25,6 +25,13 @@ #include #include #include +#include "config.h" + +#ifdef HAVE_CUDNN + +#include + +#endif namespace nd4j { std::mutex CublasHelper::_mutex; @@ -47,6 +54,18 @@ namespace nd4j { return cusolverH; } + static void* cudnn_() { +#ifdef HAVE_CUDNN + auto cudnnH = new cudnnHandle_t(); + auto status = cudnnCreate(cudnnH); + if (status != CUDNN_STATUS_SUCCESS) + throw cuda_exception::build("cuDNN handle creation failed !", status); + + return cudnnH; +#endif + return nullptr; + } + static void destroyHandle_(void* handle) { auto ch = reinterpret_cast(handle); auto status = cublasDestroy_v2(*ch); @@ -62,11 +81,13 @@ namespace nd4j { auto currentDevice = AffinityManager::currentDeviceId(); _cache.resize(numDevices); _solvers.resize(numDevices); + _cudnn.resize(numDevices); for (int e = 0; e < numDevices; e++) { AffinityManager::setCurrentNativeDevice(e); _cache[e] = handle_(); _solvers[e] = solver_(); + _cudnn[e] = cudnn_(); } // don't forget to restore back original device @@ -90,6 +111,14 @@ namespace nd4j { return _INSTANCE; } + void* CublasHelper::cudnn() { + auto deviceId = AffinityManager::currentDeviceId(); + if (deviceId < 0 || deviceId > _cudnn.size()) + throw cuda_exception::build("requested deviceId doesn't look valid", deviceId); + + return _cudnn[deviceId]; + } + void* CublasHelper::handle() { auto deviceId = AffinityManager::currentDeviceId(); return handle(deviceId); diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 1bd5d10cb..aeb2d9d36 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -35,12 +35,12 @@ 
static __global__ void simpleIndexReduceGeneric(const int op, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, - Nd4jLong *resultShapeInfo, int zRank, + Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { - functions::indexreduce::IndexReduce::transform(op,dx,xShapeInfo,extraParams,result,resultShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets); + functions::indexreduce::IndexReduce::transform(op,dx,xShapeInfo,extraParams,result,zShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets); } namespace functions { @@ -52,7 +52,7 @@ namespace functions { void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, - void *result, Nd4jLong *resultShapeInfo, + void *result, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, @@ -62,7 +62,7 @@ namespace functions { simpleIndexReduceGeneric<<>>(opNum, dx, xShapeInfo, xRank, extraParams, - result, resultShapeInfo, 0, + result, zShapeInfo, 0, nullptr, 0, 1, allocationBuffer, reductionBuffer, @@ -70,14 +70,14 @@ namespace functions { } template - _CUDA_H void IndexReduce::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { + _CUDA_H void IndexReduce::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { simpleIndexReduceGeneric<<>>( opNum, dx, xShapeInfo, xRank, extraParams, result, - resultShapeInfo, zRank, + zShapeInfo, zRank, dimension, dimensionLength, 1, allocationBuffer, reductionBuffer, tadOnlyShapeInfo, tadOffsets); @@ -158,7 +158,7 @@ namespace functions { Nd4jLong *xShapeInfo, void *extraParams, void *result, - Nd4jLong *resultShapeInfo, + Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int postProcessOrNot, @@ -166,7 +166,7 @@ namespace functions { void *reductionBuffer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, resultShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); } @@ -174,7 +174,7 @@ namespace functions { template __device__ void IndexReduce::transform(void *vdx, Nd4jLong *xShapeInfo, void *vextraParams, - void *vresult, Nd4jLong *resultShapeInfo, + void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *vreductionBuffer, @@ -183,7 +183,7 @@ namespace functions { * Gpu information for the problem */ auto dx = reinterpret_cast(vdx); - auto result = reinterpret_cast(vresult); + auto z = 
reinterpret_cast(vz); auto extraParams = static_cast(vextraParams); auto reductionBuffer = static_cast(vreductionBuffer); auto order = shape::order(xShapeInfo); @@ -203,19 +203,19 @@ namespace functions { //length for the tad __shared__ volatile Nd4jLong xLength; - __shared__ volatile Nd4jLong resultLength; + __shared__ volatile Nd4jLong zLen; //only compute the tad indexes once IndexValue reduction = OpType::startingIndexValue(dx); if (threadIdx.x == 0) { - if (resultShapeInfo != nullptr) - resultLength = shape::length(resultShapeInfo); - else resultLength = 1; + if (zShapeInfo != nullptr) + zLen = shape::length(zShapeInfo); + else zLen = 1; if (dimensionLength == 1) { - if (resultLength == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION)) + if (zLen == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION)) resultScalar = 1; else resultScalar = 0; @@ -223,13 +223,24 @@ namespace functions { else resultScalar = 0; - if (resultLength == 1) + if (zLen == 1) resultScalar = 1; xLength = shape::length(xShapeInfo); } __syncthreads(); + if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { + + if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + return; + + for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) + z[i] = (Z) reduction.index; + + return; + } + if (!resultScalar) { __shared__ Nd4jLong tadLength; @@ -261,7 +272,7 @@ namespace functions { __syncthreads(); if (threadIdx.x == 0) { - result[r] = (Z) sPartials[threadIdx.x].index; + z[r] = (Z) sPartials[threadIdx.x].index; } __syncthreads(); } @@ -282,7 +293,7 @@ namespace functions { __syncthreads(); if (threadIdx.x == 0) { - result[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams); + z[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams); } __syncthreads(); } @@ -345,14 +356,14 @@ namespace functions { __syncthreads(); if (tid == 0) { - result[0] = (Z) sPartials[0].index; + z[0] = (Z) sPartials[0].index; } } } else { if (tid == 0) { auto tc = reinterpret_cast(reductionBuffer); tc[16384] = 0; - result[0] = (Z) sPartials[0].index; + z[0] = (Z) sPartials[0].index; } } diff --git a/libnd4j/include/memory/cuda/Workspace.cu b/libnd4j/include/memory/cuda/Workspace.cu index 18b5ebf3b..aeb6b4752 100644 --- a/libnd4j/include/memory/cuda/Workspace.cu +++ b/libnd4j/include/memory/cuda/Workspace.cu @@ -143,7 +143,7 @@ namespace nd4j { cudaFreeHost((void *)this->_ptrHost); if (this->_allocatedDevice && !_externalized) - cudaFree((void *)this->_ptrHost); + cudaFree((void *)this->_ptrDevice); freeSpills(); } diff --git a/libnd4j/include/ops/declarable/OpRegistrator.h b/libnd4j/include/ops/declarable/OpRegistrator.h index effb71c67..ccec8a3c6 100644 --- a/libnd4j/include/ops/declarable/OpRegistrator.h +++ b/libnd4j/include/ops/declarable/OpRegistrator.h @@ -27,6 +27,7 @@ #include #include #include +#include // handlers part #include @@ -66,8 +67,8 @@ namespace nd4j { std::vector _uniqueD; // pointers to platform-specific helpers - std::map _helpersLH; - std::map _helpersH; + std::map, nd4j::ops::platforms::PlatformHelper*> _helpersLH; + std::map, nd4j::ops::platforms::PlatformHelper*> _helpersH; std::vector _uniqueH; std::mutex _locker; @@ -98,13 +99,13 @@ namespace nd4j { void registerHelper(nd4j::ops::platforms::PlatformHelper* op); - bool hasHelper(Nd4jLong hash); + bool hasHelper(Nd4jLong hash, samediff::Engine engine); nd4j::ops::DeclarableOp* getOperation(const char *name); 
nd4j::ops::DeclarableOp* getOperation(Nd4jLong hash); nd4j::ops::DeclarableOp* getOperation(std::string &name); - nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash); + nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash, samediff::Engine engine); std::vector getAllHashes(); diff --git a/libnd4j/include/ops/declarable/PlatformHelper.h b/libnd4j/include/ops/declarable/PlatformHelper.h index 6fbbae3b8..afa0107fc 100644 --- a/libnd4j/include/ops/declarable/PlatformHelper.h +++ b/libnd4j/include/ops/declarable/PlatformHelper.h @@ -22,6 +22,7 @@ #define SD_PLATFORMHELPER_H #include +#include #include #include #include @@ -35,18 +36,23 @@ namespace nd4j { */ class ND4J_EXPORT PlatformHelper { protected: + // target engine for this impl + samediff::Engine _engine; + // name of the operation this helper is built for std::string _name; // hash of the operation this helper is built for Nd4jLong _hash; public: - PlatformHelper(const char *name); + PlatformHelper(const char *name, samediff::Engine engine); ~PlatformHelper() = default; std::string name(); + samediff::Engine engine(); + Nd4jLong hash(); /** diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp index a4fd7f7c3..0652f1840 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp @@ -199,16 +199,16 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { int dH = INT_ARG(10); // dilations height int dW = INT_ARG(11); // dilations width int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID - int isNDHWC = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + int isNCDHW = block.getIArguments()->size() > 13 ? 
!INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes - ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); int trueoD, trueoH, trueoW; // true output depth/height/width ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode); - REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !"); + REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D_BP OP: causal padding mode (paddingMode = 2) is not allowed for this operation !"); std::string expectedGradOShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW, 0,indIOioC,indIOioD,indIOioD+1,indIOioD+2})); std::string expectedWeightsShape = ShapeUtils::shapeAsString({kD, kH, kW, iC, oC}); REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); @@ -222,7 +222,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { std::vector gradOaxesForDot; - if(!isNDHWC) { + if(!isNCDHW) { gradOaxesForDot = {0,1,2,3}; // bS, oD, oH, oW input = new NDArray(input->permute({0,4,1,2,3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] gradI = new NDArray(gradI->permute({0,4,1,2,3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] @@ -249,7 +249,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2,3,4,1,0,5,6,7}); // [kD, kH, kW, iC, oC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW] ConvolutionUtils::col2vol(block, columns, *gradI, sD, sH, sW, pD, pH, pW, dD, dH, dW); // columns [bS, iC, kD, kH, kW, oD, oH, oW] is de-convoluted to [bS, iC, iD, iH, iW] - if(!isNDHWC) { + if(!isNCDHW) { delete input; delete gradI; } @@ -287,7 +287,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) { int dH = INT_ARG(10); // dilations height int dW = INT_ARG(11); // dilations width int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID - int isNDHWC = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + int isNCDHW = block.getIArguments()->size() > 13 ? 
!INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW const int rank = 5; REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !"); @@ -296,7 +296,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) { REQUIRE_TRUE(gradOShapeInfo[0] == rank, 0, "CUSTOM CONV3D_BP OP: rank of output gradients (next epsilon) array must be equal to %i, but got %i instead !", rank, gradOShapeInfo); int indIOioC, indIiD, indWoC(4); - if(!isNDHWC) { + if(!isNCDHW) { indIOioC = 4; indIiD = 1; } else { diff --git a/libnd4j/include/ops/declarable/helpers/convolutions.h b/libnd4j/include/ops/declarable/helpers/convolutions.h index 68b39cfd5..e8bf735bc 100644 --- a/libnd4j/include/ops/declarable/helpers/convolutions.h +++ b/libnd4j/include/ops/declarable/helpers/convolutions.h @@ -41,8 +41,10 @@ namespace nd4j { static inline void calcOutSizePool2D(int& oH, int& oW, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int iH, const int iW, const int paddingMode) { if(paddingMode == 0) { // valid - oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1; - oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1; + // oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1; + // oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1; + oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1; + oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1; } else if (paddingMode == 1) { // same oH = (int) math::nd4j_ceil(iH * 1. / sH); @@ -57,9 +59,9 @@ namespace nd4j { static inline void calcOutSizePool3D(int& oD, int& oH, int& oW, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int iD, const int iH, const int iW, const int paddingMode) { if(paddingMode == 0) { // valid - oD = (iD - (kD + (kD - 1) * (dD - 1)) + 2 * pD) / sD + 1; - oH = (iH - (kH + (kH - 1) * (dH - 1)) + 2 * pH) / sH + 1; - oW = (iW - (kW + (kW - 1) * (dW - 1)) + 2 * pW) / sW + 1; + oD = (iD - ((kD - 1) * dD + 1) + 2 * pD) / sD + 1; + oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1; + oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1; } else if(paddingMode == 1) { // same oD = (int) nd4j::math::nd4j_ceil(iD * 1. 
/ sD); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index b8cd35261..ffa741509 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -1121,8 +1121,12 @@ namespace helpers { I const* cropSizes = reinterpret_cast(cropSize->getSpecialBuffer()); T* outBuf = reinterpret_cast(crops->specialBuffer()); + int threadsPerBlock = math::nd4j_max(imageHeight * imageWidth, cropHeight * cropWidth); + if(threadsPerBlock > MAX_NUM_THREADS/4) + threadsPerBlock = MAX_NUM_THREADS/4; + NDArray::prepareSpecialUse({crops}, {images, boxes, indices, cropSize}); - cropAndResizeKernel<<>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(), + cropAndResizeKernel<<>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(), cropSizes, cropSize->getSpecialShapeInfo(), method, extrapolationVal, outBuf, crops->specialShapeInfo(), numBoxes, cropHeight, cropWidth, batchSize, imageHeight, imageWidth, depth); NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize}); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu index 9b749c6e2..ea9901f0a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu @@ -30,7 +30,7 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// // calculate digamma function for array elements template -static void lgamma_(NDArray& x, NDArray& z) { +void lgamma_(NDArray& x, NDArray& z) { //auto dtype = x.dataType(); auto lgammaProc = LAMBDA_T(x_, dtype) { return T(DataTypeUtils::fromT() == DataType::DOUBLE?::lgamma(x_): ::lgammaf(x_)); //math::nd4j_log(math::nd4j_gamma(x)); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index a7df3cbea..8d5cb90d4 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -535,8 +535,8 @@ namespace nd4j { // platform helpers use might be forbidden for various reasons, so we'll check it out first if (block->helpersAllowed() && nd4j::Environment::getInstance()->helpersAllowed()) { // if we have platform-specific helper for this op - invoke it - if (OpRegistrator::getInstance()->hasHelper(this->getOpHash())) { - auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash()); + if (OpRegistrator::getInstance()->hasHelper(this->getOpHash(), block->engine())) { + auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash(), block->engine()); if (helper->isUsable(*block)) { status = helper->invokeHelper(*block); hasHelper = true; diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp index 5622f4316..b1261b37c 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp @@ -69,9 +69,9 @@ namespace nd4j { } else if (block.getTArguments()->size() > 0) { auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); - NDArray::prepareSpecialUse({z}, {x, &y}); - - NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), 
x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1)); + x->applyScalarArr(static_cast(opNum), y, *z); + // NDArray::prepareSpecialUse({z}, {x, &y}); + // NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1)); manager.synchronize(); } else { diff --git a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp index a42203162..09e4ec58f 100644 --- a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp +++ b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp @@ -173,15 +173,18 @@ namespace nd4j { } void OpRegistrator::registerHelper(nd4j::ops::platforms::PlatformHelper* op) { - if (_helpersLH.count(op->hash()) > 0) + std::pair p = {op->hash(), op->engine()}; + if (_helpersLH.count(p) > 0) throw std::runtime_error("Tried to double register PlatformHelper"); _uniqueH.emplace_back(op); - std::pair pair(op->name(), op); + nd4j_debug("Adding helper for op \"%s\": [%lld - %i]\n", op->name().c_str(), op->hash(), (int) op->engine()); + + std::pair, nd4j::ops::platforms::PlatformHelper*> pair({op->name(), op->engine()}, op); _helpersH.insert(pair); - std::pair pair2(op->hash(), op); + std::pair, nd4j::ops::platforms::PlatformHelper*> pair2(p, op); _helpersLH.insert(pair2); } @@ -227,15 +230,17 @@ namespace nd4j { return _declarablesD.at(name); } - nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash) { - if (_helpersLH.count(hash) == 0) + nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash, samediff::Engine engine) { + std::pair p = {hash, engine}; + if (_helpersLH.count(p) == 0) throw std::runtime_error("Requested helper can't be found"); - return _helpersLH[hash]; + return _helpersLH[p]; } - bool OpRegistrator::hasHelper(Nd4jLong hash) { - return _helpersLH.count(hash) > 0; + bool OpRegistrator::hasHelper(Nd4jLong hash, samediff::Engine engine) { + std::pair p = {hash, engine}; + return _helpersLH.count(p) > 0; } int OpRegistrator::numberOfOperations() { diff --git a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp index 75dc6e2c4..86c84b0fb 100644 --- a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp +++ b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp @@ -24,10 +24,11 @@ namespace nd4j { namespace ops { namespace platforms { - PlatformHelper::PlatformHelper(const char *name) { + PlatformHelper::PlatformHelper(const char *name, samediff::Engine engine) { // we just store name/hash of target operation _name = std::string(name); _hash = HashHelper::getInstance()->getLongHash(_name); + _engine = engine; } nd4j::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) { @@ -74,6 +75,10 @@ namespace nd4j { return z; } + samediff::Engine PlatformHelper::engine() { + return _engine; + } + std::string PlatformHelper::name() { return _name; } diff --git a/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu b/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu new file mode 100644 index 000000000..3bd1357bf --- /dev/null +++ 
b/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu @@ -0,0 +1,275 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include "cudnnUtils.h" +#include + +namespace nd4j { +namespace ops { +namespace platforms { + +////////////////////////////////////////////////////////////////////////// +static void batchnormCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* mean, const NDArray* variance, + const NDArray* gamma, const NDArray* beta, + NDArray* output, + const double epsilon, const bool isSpatialMode) { + + + // input, output -> 4D:nchw, 5D:ncdhw + // mean, variance, gamma, beta -> 1xCx1x1 for 4D and 1xCx1x1x1 for 5D for BATCHNORM_MODE_SPATIAL mode + // -> 1xCxHxW for 4D and 1xCxDxHxW for 5D for BATCHNORM_MODE_PER_ACTIVATION mode + + const cudnnDataType_t dataType = cudnnDataType(input->dataType()); + + const int xRank = input->rankOf(); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); + + const std::vector xShape = input->getShapeAsVectorInt(); // input and output have same shapes + + std::vector paramsShape, paramsStrides; // mean, variance, gamma and beta have same shapes + if(isSpatialMode) { // 1xCx1x1 + const int iC = mean->lengthOf(); + const int stride0 = mean->strideAt(0); + paramsShape = xRank == 4 ? std::vector({1, iC, 1, 1}) : std::vector({1, iC, 1, 1, 1}); + paramsStrides = xRank == 4 ? std::vector({iC*stride0, stride0, 1, 1}) : std::vector({iC*stride0, stride0, 1, 1, 1}); + } + else { + paramsShape = mean->getShapeAsVectorInt(); + paramsStrides = xRank == 4 ? 
std::vector({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3)}) : std::vector({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3), (int)mean->strideAt(4)}); + } + + std::vector xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3)}; + std::vector zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3)}; + + if(xRank > 4) { // 5D + xStrides.push_back((int)input->strideAt(4)); + zStrides.push_back((int)output->strideAt(4)); + } + + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data()); + else + err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + + // output descriptor + cudnnTensorDescriptor_t z; + cudnnCreateTensorDescriptor(&z); + if(output->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(z, format, dataType, xRank, xShape.data()); + else + err = cudnnSetTensorNdDescriptor(z, dataType, xRank, xShape.data(), zStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); + + // mean, variance, gamma and beta descriptor, the same descriptor for all of them + cudnnTensorDescriptor_t params; + cudnnCreateTensorDescriptor(¶ms); + if(mean->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data()); + else + err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err); + + + if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetConvolutionNdDescriptor failed", err); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* ptrAlpha = output->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* ptrBeta = output->sizeOfT() <= 4 ? reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({output}, {input, mean, variance, gamma, beta}); + + // calculations + err = cudnnBatchNormalizationForwardInference(*handle, isSpatialMode ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION, + ptrAlpha, ptrBeta, + x, input->getSpecialBuffer(), + z, output->getSpecialBuffer(), + params, + gamma ? gamma->getSpecialBuffer(): nullptr, + beta ? 
beta->getSpecialBuffer() : nullptr, + mean->getSpecialBuffer(), variance->getSpecialBuffer(), epsilon); + + if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err); + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("batchnormCUDNN: cudaStreamSynchronize failed !", cudaErr); + + + NDArray::registerSpecialUse({output}, {input, mean, variance, gamma, beta}); +} + + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(batchnorm, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); + auto mean = INPUT_VARIABLE(1); + auto variance = INPUT_VARIABLE(2); + NDArray* gamma = nullptr; + NDArray* beta = nullptr; + + auto output = OUTPUT_VARIABLE(0); + + const bool applyScale = (bool)INT_ARG(0); + const bool applyOffset = (bool)INT_ARG(1); + const double epsilon = T_ARG(0); + + if(applyScale) + gamma = INPUT_VARIABLE(3); + if(applyOffset) + beta = INPUT_VARIABLE(3 + (int)applyScale); + + const int numOfIntArgs = block.getIArguments()->size(); + const int inRank = input->rankOf(); + + // get axes args to normalize input array over + std::vector axes; + if(numOfIntArgs > 2) + for(int i = 2; i < numOfIntArgs; ++i) + axes.push_back(INT_ARG(i)); + else + axes.push_back(inRank-1); // default dimension to reduce along is last dimension + + const int numOfAxes = axes.size(); + REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM CUDNN op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); + + // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes + // for example if input shape is {2,3,4,5,6} and axes = {1,3}, then expected shape would be {1,3,1,5,1}, and if axes = {3}, then expected shape would be {5} + std::vector expShape; + if(numOfAxes == 1) + expShape.push_back(input->sizeAt(axes[0])); + else { // get, for example, something like {1, inputDim1, 1, inputDim3, 1} if axes = {1, 3} + expShape = std::vector(inRank, 1); + for(uint i = 0; i < numOfAxes; ++i) + expShape[axes[i]] = input->sizeAt(axes[i]); + } + + REQUIRE_TRUE(mean->isSameShape(expShape) , 0, "BATCHNORM CUDNN op: wrong shape of mean array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(mean).c_str()); + REQUIRE_TRUE(variance->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of variance array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(variance).c_str()); + if(gamma) + REQUIRE_TRUE(gamma->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of gamma array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(gamma).c_str()); + if(beta) + REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str()); + + // types of all input arrays should be the same + for(int i = 1; i < block.width(); ++i) + REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM CUDNN op: types of all input arrays should be the same !"); + + // cudnn supports NCHW format only + const bool needPermut = axes.size() == 1 && mean->lengthOf() == input->sizeAt(-1); + + 
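// note: batchnormCUDNN above builds NCHW/NCDHW descriptors only, so when the single normalization axis is the
+    // last one (NHWC/NDHWC data) the input and output are handed to cuDNN through NHWC->NCHW permuted views that
+    // share the original buffers (no copy is made); the temporary view wrappers are deleted right after the call
+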
if(needPermut) { // if NHWC + std::vector perm = {0, 3, 1, 2}; // NHWC -> NCHW + input = new NDArray(input->permute(perm)); + output = new NDArray(output->permute(perm)); + } + + // calculations + batchnormCUDNN(block.launchContext(), input, mean, variance, gamma, beta, output, epsilon, axes.size() == 1); + + if(needPermut) { + delete input; + delete output; + } + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_CHECK(batchnorm, ENGINE_CUDA) { + + const bool applyScale = (bool)INT_ARG(0); + const bool applyOffset = (bool)INT_ARG(1); + + NDArray* input = INPUT_VARIABLE(0); + NDArray* mean = INPUT_VARIABLE(1); + NDArray* variance = INPUT_VARIABLE(2); + NDArray* gamma = applyScale ? INPUT_VARIABLE(3) : nullptr; + NDArray* beta = applyOffset ? INPUT_VARIABLE(3 + (int)applyScale) : nullptr; + + const int numOfIntArgs = block.getIArguments()->size(); + const int xRank = input->rankOf(); + + // disable cudnn batchnorm so far + return false; + + // *********************************** // + if(xRank != 4 && xRank != 5) + return false; + + // *********************************** // + const bool badType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + if(badType) + return false; + + // *********************************** // + // get axes args to normalize input array over + std::vector axes; + if(numOfIntArgs > 2) + for(int i = 2; i < numOfIntArgs; ++i) + axes.push_back(INT_ARG(i)); + else + axes.push_back(xRank-1); // default dimension to reduce along is last dimension + + if(axes.size() != 1 && axes.size() != 3 && axes.size() != 4) + return false; + + // *********************************** // + bool allParamsHaveSameShapeAndStrides = shape::haveSameShapeAndStrides(mean->getShapeInfo(), variance->getShapeInfo()); + if(gamma) + allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), gamma->getShapeInfo()); + if(beta) + allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), beta->getShapeInfo()); + + if(!allParamsHaveSameShapeAndStrides) + return false; + + // *********************************** // + bool isFormatGood = false; + if(axes.size() == 1) + isFormatGood = mean->lengthOf() == input->sizeAt(1) || mean->lengthOf() == input->sizeAt(-1); // mean [C] + else { + auto inputShapeModif = input->getShapeAsVector(); // [dim0,dim1,dim2,dim3] 4D or [dim0,dim1,dim2,dim3,dim4] + inputShapeModif[0] = 1; + isFormatGood = mean->isSameShape(inputShapeModif); // mean [1,dim1,dim2,dim3] 4D or [1,dim1,dim2,dim3,dim4] + } + if(!isFormatGood) + return false; + + return true; +} + + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu new file mode 100644 index 000000000..234dbffb7 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu @@ -0,0 +1,521 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include "cudnnUtils.h" +#include + +namespace nd4j { +namespace ops { +namespace platforms { + +////////////////////////////////////////////////////////////////////////// +static void conv2dCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, + const int kH, const int kW, + const int sH, const int sW, + const int pH, const int pW, + const int dH, const int dW, + const int paddingMode, const bool isNCHW) { + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); + + cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + + // weights descriptor + cudnnFilterDescriptor_t w; + cudnnCreateFilterDescriptor(&w); + err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW); + if(err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); + + // output descriptor + cudnnTensorDescriptor_t z; + cudnnCreateTensorDescriptor(&z); + if(output->ews() == 1) + err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW); + else + err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); + + // algorithm description + cudnnConvolutionFwdAlgo_t algo; + err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + + + // 
allocate auxiliary device memory, abbreviation ws means workspace + size_t wsSize; + err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + void* wsData; + auto cudaErr = cudaMalloc(&wsData, wsSize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* beta = output->sizeOfT() <= 4 ? reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({output}, {input, weights, bias}); + + // run calculation + err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnConvolutionForward failed", err); + + // add bias if it is present + if (bias != nullptr) { + + cudnnTensorDescriptor_t b; + cudnnCreateTensorDescriptor(&b); + err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); + err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnAddTensor bias failed", err); + } + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("conv2dCUDNN: cudaStreamSynchronize failed !", cudaErr); + + cudaErr = cudaFree(wsData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + + NDArray::registerSpecialUse({output}, {input, weights, bias}); +} + +////////////////////////////////////////////////////////////////////////// +static void conv2dBpCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* gradO, + NDArray* gradI, NDArray* gradW, NDArray* gradB, + const int kH, const int kW, + const int sH, const int sW, + const int pH, const int pW, + const int dH, const int dW, + const int paddingMode, const bool isNCHW) { + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: can't set stream for cuDNN", err); + + cudnnTensorFormat_t format = isNCHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + + // gradO descriptor + cudnnTensorDescriptor_t dz; + cudnnCreateTensorDescriptor(&dz); + if(gradO->ews() == 1) + err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW); + else + err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); + + // gradI descriptor + cudnnTensorDescriptor_t dx; + cudnnCreateTensorDescriptor(&dx); + if(gradI->ews() == 1) + err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); + + // gradW descriptor + cudnnFilterDescriptor_t dw; + cudnnCreateFilterDescriptor(&dw); + err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW); + if(err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); + + // gradW algorithm description + cudnnConvolutionBwdFilterAlgo_t algoGradW; + err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + + // gradI algorithm description + cudnnConvolutionBwdDataAlgo_t algoGradI; + err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + + // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace + size_t wsGradWSize; + err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); + void* wsGradWData; + auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize); + if (cudaErr != 0) throw 
nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); + + // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace + size_t wsGradISize; + err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); + void* wsGradIData; + cudaErr = cudaMalloc(&wsGradIData, wsGradISize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); + + // run calculation for gradB (if not nullptr) + if(gradB != nullptr) { + cudnnTensorDescriptor_t db; + cudnnCreateTensorDescriptor(&db); + err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err); + + err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err); + } + + // run calculation for gradW + err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); + + // run calculation for gradI + err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardData failed", err); + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("conv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr); + + cudaErr = cudaFree(wsGradWData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); + cudaErr = cudaFree(wsGradIData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); + + NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(conv2d, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always + auto bias = block.width() > 2 ? 
INPUT_VARIABLE(2) : nullptr;      // [oC]
+
+    auto output = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+
+    int sH = INT_ARG(2);                                                // strides height
+    int sW = INT_ARG(3);                                                // strides width
+    int pH = INT_ARG(4);                                                // paddings height
+    int pW = INT_ARG(5);                                                // paddings width
+    int dH = INT_ARG(6);                                                // dilations height
+    int dW = INT_ARG(7);                                                // dilations width
+    int paddingMode = INT_ARG(8);                                       // 0-VALID, 1-SAME
+    bool isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;  // INT_ARG(9): 0-NCHW, 1-NHWC
+
+    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
+    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width
+
+    REQUIRE_TRUE(input->rankOf() == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
+
+    int bS, iC, iH, iW, oC, oH, oW;                         // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    if (bias) {
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+        REQUIRE_TRUE((bias->rankOf() == 1 && bias->strideAt(0) == 1) || (bias->rankOf() == 2 && bias->sizeAt(0) == 1 && bias->strideAt(1) == 1) || (bias->rankOf() == 2 && bias->sizeAt(1) == 1 && bias->strideAt(0) == 1), 0, "CUSTOM CONV2D CUDNN OP: bias array should be contiguous in memory !");
+    }
+
+    NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext()); // cudnn supports only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC}
+    newWeights->assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC --> oC, iC, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = nullptr;
+    if(paddingMode == 1) // in SAME padding mode cudnn doesn't support asymmetric left/right, top/bottom paddings
+        checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
+
+    conv2dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW);
+
+    if(newInput != input)
+        delete newInput;
+
+    delete newWeights;
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_CHECK(conv2d, ENGINE_CUDA) {
+
+    auto input = INPUT_VARIABLE(0);    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);  // [kH, kW, iC, oC] always
+    auto bias = block.width() > 2 ?
INPUT_VARIABLE(2) : nullptr; // [oC] + + const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType; +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(conv2d_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon + auto gradW = OUTPUT_VARIABLE(1); // [kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + int kH = INT_ARG(0); // filter(kernel) height + int kW = INT_ARG(1); // filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME + int isNCHW = block.getIArguments()->size() > 9 ? 
!INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + REQUIRE_TRUE(input->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf()); + REQUIRE_TRUE(gradO->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of output's gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf()); + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH); + + int trueoH, trueoW; // true output height, width + ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode); + + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode); + + std::vector expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW, 0,indIOioC,indOoH,indOoH+1}); + std::vector expectedWeightsShape = {kH, kW, iC, oC}; + REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if(bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf()); + + NDArray* newGradW = new NDArray(gradW->ordering(), {oC, iC, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC} + NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext()); + + newWeights->assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC --> oC, iC, kH, kW) + + NDArray* newInput = input; + NDArray* newGradI = gradI; + if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings + checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW); + + conv2dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW); + + newGradW->permutei({2,3,1,0}); // [oC, iC, kH, kW] -> [kH, kW, iC, oC] + gradW->assign(newGradW); + + if(newInput != input) { + + if(isNCHW) + gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3)})); + else + gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,0})); + + delete newInput; + delete newGradI; + } + + delete newWeights; + delete newGradW; + + return Status::OK(); +} + +PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, 
iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + + const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL + const int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return isNCHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType; +} + + + + + + + +// PLATFORM_IMPL(conv2d, ENGINE_CUDA) { + +// auto handle = reinterpret_cast(block.launchContext()->getCuDnnHandle()); +// auto res = cudnnSetStream(*handle, *block.launchContext()->getCudaStream()); +// if (res != 0) +// throw nd4j::cuda_exception::build("Can't set stream for cuDNN", res); + +// auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) +// auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always +// auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] + +// auto output = OUTPUT_VARIABLE(0); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) + +// NDArray::prepareSpecialUse({output}, {input, weights, bias}); + +// int sH = INT_ARG(2); // strides height +// int sW = INT_ARG(3); // strides width +// int pH = INT_ARG(4); // paddings height +// int pW = INT_ARG(5); // paddings width +// int dH = INT_ARG(6); // dilations height +// int dW = INT_ARG(7); // dilations width +// int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME +// bool isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + +// int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0)); // filter(kernel) height +// int kW = INT_ARG(1) > 0 ? 
INT_ARG(1) : static_cast(weights->sizeAt(1)); // filter(kernel) width + +// int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; +// int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes +// ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH); +// ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, isSameMode); + +// auto dtype = cudnnDataType(input->dataType()); + + +// cudnnTensorDescriptor_t src; +// cudnnCreateTensorDescriptor(&src); +// res = cudnnSetTensor4dDescriptorEx(src, dtype, input->sizeAt(0), input->sizeAt(1), input->sizeAt(2), input->sizeAt(3), input->strideAt(0), input->strideAt(1), input->strideAt(2), input->strideAt(3)); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx src failed", res); + +// // TODO: we definitely want NHWC here as well +// cudnnFilterDescriptor_t wght; +// cudnnCreateFilterDescriptor(&wght); +// res = cudnnSetFilter4dDescriptor(wght, dtype, CUDNN_TENSOR_NCHW, oC, iC, kH, kW); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetFilter4dDescriptor failed", res); + +// cudnnConvolutionDescriptor_t cdc; +// cudnnCreateConvolutionDescriptor(&cdc); +// res = cudnnSetConvolution2dDescriptor(cdc, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, dtype); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetConvolution2dDescriptor failed", res); + +// cudnnTensorDescriptor_t dst; +// cudnnCreateTensorDescriptor(&dst); +// res = cudnnSetTensor4dDescriptorEx(dst, dtype, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3), output->strideAt(0), output->strideAt(1), output->strideAt(2), output->strideAt(3)); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx dst failed", res); + +// // TODO: workspace algorithms are supposed to be faster, so we should use it here if we have enough memory +// cudnnConvolutionFwdAlgo_t algo; +// res = cudnnGetConvolutionForwardAlgorithm(*handle, src, wght, cdc, dst, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnGetConvolutionForwardAlgorithm failed", res); + +// // TODO: should be float if dtype is half/float, and double otherwise +// float alpha = 1.0f; +// float beta = 0.0f; +// res = cudnnConvolutionForward(*handle, &alpha, src, input->specialBuffer(), wght, weights->specialBuffer(), cdc, algo, nullptr, 0, &beta, dst, output->specialBuffer()); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnConvolutionForward failed", res); + + +// if (bias != nullptr) { +// cudnnTensorDescriptor_t bs; +// cudnnCreateTensorDescriptor(&bs); +// if (isNCHW) { +// res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NCHW, dtype, 1, bias->lengthOf(), 1, 1); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); +// } else { +// res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NHWC, dtype, 1, 1, 1, bias->lengthOf()); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); +// } + +// res = cudnnAddTensor(*handle, &alpha, bs, bias->specialBuffer(), &alpha, dst, output->specialBuffer()); +// if (res != 0) +// throw nd4j::cuda_exception::build("cudnnAddTensor failed", res); +// } + + +// NDArray::registerSpecialUse({output}, {input, weights, 
bias}); + +// return Status::OK(); +// } + + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu b/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu new file mode 100644 index 000000000..9d30ff04c --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu @@ -0,0 +1,453 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include "cudnnUtils.h" +#include + +namespace nd4j { +namespace ops { +namespace platforms { + +////////////////////////////////////////////////////////////////////////// +static void conv3dCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, + const int kD, const int kH, const int kW, + const int sD, const int sH, const int sW, + const int pD, const int pH, const int pW, + const int dD, const int dH, const int dW, + const int paddingMode, const bool isNCDHW) { + + const int numDims = 5; + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: can't set stream for cuDNN", err); + + const std::vector pads = {pD, pH, pW}; + const std::vector filtStrides = {sD, sH, sW}; + const std::vector dilations = {dD, dH, dW}; + + const std::vector xShape = {bS, iC, iD, iH, iW}; + const std::vector zShape = {bS, oC, oD, oH, oW}; + const std::vector wShape = {oC, iC, kD, kH, kW}; + const std::vector bShape = {1, (isNCDHW ? oC : 1), 1, 1, (isNCDHW ? 1 : oC)}; + + const std::vector xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)}; + const std::vector zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3), (int)output->strideAt(4)}; + + cudnnTensorFormat_t format = isNCDHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data()); + else + err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + + // weights descriptor + cudnnFilterDescriptor_t w; + cudnnCreateFilterDescriptor(&w); + err = cudnnSetFilterNdDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data()); + if(err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetFilterNdDescriptor failed", err); + + // output descriptor + cudnnTensorDescriptor_t z; + cudnnCreateTensorDescriptor(&z); + if(output->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(z, format, cudnnDataType(output->dataType()), numDims, zShape.data()); + else + err = cudnnSetTensorNdDescriptor(z, cudnnDataType(output->dataType()), numDims, zShape.data(), zStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetConvolutionNdDescriptor failed", err); + + // algorithm description + cudnnConvolutionFwdAlgo_t algo; + err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + + // allocate auxiliary device memory, abbreviation ws means workspace + size_t wsSize; + err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + void* wsData; + auto cudaErr = cudaMalloc(&wsData, wsSize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* beta = output->sizeOfT() <= 4 ? 
reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({output}, {input, weights, bias}); + + // run calculation + err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnConvolutionForward failed", err); + + // add bias if it is present + if (bias != nullptr) { + + cudnnTensorDescriptor_t b; + cudnnCreateTensorDescriptor(&b); + err = cudnnSetTensorNdDescriptorEx(b, format, cudnnDataType(bias->dataType()), numDims, bShape.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor for bias failed", err); + err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnAddTensor bias failed", err); + } + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("conv3dCUDNN: cudaStreamSynchronize failed !", cudaErr); + + cudaErr = cudaFree(wsData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + + NDArray::registerSpecialUse({output}, {input, weights, bias}); +} + +////////////////////////////////////////////////////////////////////////// +static void conv3dBpCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* gradO, + NDArray* gradI, NDArray* gradW, NDArray* gradB, + const int kD, const int kH, const int kW, + const int sD, const int sH, const int sW, + const int pD, const int pH, const int pW, + const int dD, const int dH, const int dW, + const int paddingMode, const bool isNCDHW) { + + const int numDims = 5; + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: can't set stream for cuDNN", err); + + const std::vector pads = {pD, pH, pW}; + const std::vector filtStrides = {sD, sH, sW}; + const std::vector dilations = {dD, dH, dW}; + + const std::vector xShape = {bS, iC, iD, iH, iW}; + const std::vector dzShape = {bS, oC, oD, oH, oW}; + const std::vector wShape = {oC, iC, kD, kH, kW}; + const std::vector dbShape = {1, (int)(isNCDHW ? oC : 1), 1, 1, (int)(isNCDHW ? 1 : oC)}; + + const std::vector xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)}; + const std::vector dxStrides = {(int)gradI->strideAt(0), (int)gradI->strideAt(1), (int)gradI->strideAt(2), (int)gradI->strideAt(3), (int)gradI->strideAt(4)}; + const std::vector dzStrides = {(int)gradO->strideAt(0), (int)gradO->strideAt(1), (int)gradO->strideAt(2), (int)gradO->strideAt(3), (int)gradO->strideAt(4)}; + + cudnnTensorFormat_t format = isNCDHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data()); + else + err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + + // gradO descriptor + cudnnTensorDescriptor_t dz; + cudnnCreateTensorDescriptor(&dz); + if(gradO->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(dz, format, cudnnDataType(gradO->dataType()), numDims, dzShape.data()); + else + err = cudnnSetTensorNdDescriptor(dz, cudnnDataType(gradO->dataType()), numDims, dzShape.data(), dzStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); + + // gradI descriptor + cudnnTensorDescriptor_t dx; + cudnnCreateTensorDescriptor(&dx); + if(gradI->ews() == 1) + err = cudnnSetTensorNdDescriptorEx(dx, format, cudnnDataType(gradI->dataType()), numDims, xShape.data()); + else + err = cudnnSetTensorNdDescriptor(dx, cudnnDataType(gradI->dataType()), numDims, xShape.data(), dxStrides.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err); + + // gradW descriptor + cudnnFilterDescriptor_t dw; + cudnnCreateFilterDescriptor(&dw); + err = cudnnSetFilterNdDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data()); + if(err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetFilterNdDescriptor failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetConvolutionNdDescriptor failed", err); + + // gradW algorithm description + cudnnConvolutionBwdFilterAlgo_t algoGradW; + err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + + // gradI algorithm description + cudnnConvolutionBwdDataAlgo_t algoGradI; + err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + + // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace + size_t wsGradWSize; + err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); + void* wsGradWData; + auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); + + // allocate auxiliary device memory for gradI calculation, abbreviation ws means 
workspace + size_t wsGradISize; + err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); + void* wsGradIData; + cudaErr = cudaMalloc(&wsGradIData, wsGradISize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); + + // run calculation for gradB (if not nullptr) + if(gradB != nullptr) { + + cudnnTensorDescriptor_t db; + cudnnCreateTensorDescriptor(&db); + err = cudnnSetTensorNdDescriptorEx(db, format, cudnnDataType(gradB->dataType()), numDims, dbShape.data()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor for gradB failed", err); + + err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardBias failed", err); + } + + // run calculation for gradW + err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); + + // run calculation for gradI + err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardData failed", err); + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("conv3dBpCUDNN: cudaStreamSynchronize failed !", cudaErr); + + cudaErr = cudaFree(wsGradWData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); + cudaErr = cudaFree(wsGradIData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); + + NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(conv3dnew, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto output = OUTPUT_VARIABLE(0); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) + + REQUIRE_TRUE(input->rankOf() == 5, 0, "CONV3D CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf()); + + int kD = INT_ARG(0) > 0 ? 
INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) depth + int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) height + int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast(weights->sizeAt(2));// filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int paddingMode = INT_ARG(12); // 0-SAME, 1-VALID + int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + + REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !"); + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode); + + std::vector expectedWeightsShape = {kD, kH, kW, iC, oC}; + REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CONV3D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if (bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf()); + + NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC} + newWeights->assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC --> oC, iC, kD, kH, kW) + + NDArray* newInput = input; + NDArray* newGradI = nullptr; + if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings + checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW); + + conv3dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW, paddingMode, isNCDHW); + + if(newInput != input) + delete newInput; + + delete newWeights; + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_CHECK(conv3dnew, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 2 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] + + int paddingMode = INT_ARG(12); // 0-SAME, 1-VALID + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType; +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(conv3dnew_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon + auto gradW = OUTPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + REQUIRE_TRUE(input->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf()); + REQUIRE_TRUE(gradO->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 5, but got %i instead !", gradO->rankOf()); + + int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) depth + int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) height + int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast(weights->sizeAt(2));// filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID + int isNCDHW = block.getIArguments()->size() > 13 ? 
!INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + + int trueoD, trueoH, trueoW; // true output depth/height/width + ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode); + + REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D_BP CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !"); + + std::vector expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW, 0,indIOioC,indIOioD,indIOioD+1,indIOioD+2}); + std::vector expectedWeightsShape = {kD, kH, kW, iC, oC}; + REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "CONV3D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(gradW->isSameShape(expectedWeightsShape), 0, "CONV3D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if(bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf()); + + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode); + + NDArray* newGradW = new NDArray(gradW->ordering(), {oC, iC, kD, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC} + NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext()); + + newWeights->assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC --> oC, iC, kD, kH, kW) + + NDArray* newInput = input; + NDArray* newGradI = gradI; + if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings + checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW); + + conv3dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW,paddingMode,isNCDHW); + + newGradW->permutei({2,3,4,1,0}); // [oC, iC, kD, kH, kW] -> [kD, kH, kW, iC, oC] + gradW->assign(newGradW); + + if(newInput != input) { + + if(isNCDHW) + gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3), 0,gradI->sizeAt(4)})); + else + gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,gradI->sizeAt(3), 0,0})); + + delete newInput; + delete newGradI; + } + + delete newWeights; + delete newGradW; + + return Status::OK(); +} + +PLATFORM_CHECK(conv3dnew_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? 
INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + + int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID + int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return isNCDHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType; +} + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h new file mode 100644 index 000000000..bdff86e24 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h @@ -0,0 +1,158 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_CUDNNUTILS_H +#define SD_CUDNNUTILS_H + +#include +#include +#include +#include +#include +#include + +#include + +namespace nd4j { +namespace ops { +namespace platforms { + + DECLARE_PLATFORM(conv2d, ENGINE_CUDA); + DECLARE_PLATFORM(conv2d_bp, ENGINE_CUDA); + + DECLARE_PLATFORM(conv3dnew, ENGINE_CUDA); + DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CUDA); + + DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CUDA); + DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CUDA); + + DECLARE_PLATFORM(batchnorm, ENGINE_CUDA); + DECLARE_PLATFORM(batchnorm_bp, ENGINE_CUDA); + +////////////////////////////////////////////////////////////////////////// +FORCEINLINE cudnnDataType_t cudnnDataType(nd4j::DataType dataType) { + switch (dataType) { + case nd4j::DataType::FLOAT32: + return CUDNN_DATA_FLOAT; + case nd4j::DataType::DOUBLE: + return CUDNN_DATA_DOUBLE; + case nd4j::DataType::HALF: + return CUDNN_DATA_HALF; + case nd4j::DataType::INT32: + return CUDNN_DATA_INT32; + case nd4j::DataType::INT8: + return CUDNN_DATA_INT8; + default: + throw datatype_exception::build("Unsupported data type", dataType); + } +} + +////////////////////////////////////////////////////////////////////////// +FORCEINLINE void checkConv2dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI, + const int iH, const int iW, + const int oH, const int oW, + const int kH, const int kW, + const int sH, const int sW, + const int pH, const int pW, + const int dH, const int dW, + const bool isNCHW) { + + const auto pHsum = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH); + const auto pWsum = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW); + + const bool isPHasymm = pH != (pHsum - pH); + const bool isPWasymm = pW != (pWsum - pW); + + if(!isPHasymm && !isPWasymm) + return; + + std::vector newShape = input->getShapeAsVector(); + + const int iHposition = isNCHW ? 2 : 1; + + if(isPHasymm) + newShape[iHposition] += 1; + if(isPWasymm) + newShape[iHposition + 1] += 1; + + NDArray* newInput = new NDArray(input->ordering(), newShape, input->dataType(), input->getContext()); + + if(isNCHW) + (*newInput)({0,0, 0,0, 0,input->sizeAt(2), 0,input->sizeAt(3)}).assign(input); + else + (*newInput)({0,0, 0,input->sizeAt(1), 0,input->sizeAt(2), 0,0}).assign(input); + + input = newInput; + + if(gradI != nullptr) + gradI = new NDArray(gradI->ordering(), newShape, gradI->dataType(), gradI->getContext()); +} + + +////////////////////////////////////////////////////////////////////////// +FORCEINLINE void checkConv3dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI, + const int iD, const int iH, const int iW, + const int oD, const int oH, const int oW, + const int kD, const int kH, const int kW, + const int sD, const int sH, const int sW, + const int pD, const int pH, const int pW, + const int dD, const int dH, const int dW, + const bool isNCDHW) { + + const auto pDsum = ((oD - 1) * sD + ((kD - 1) * dD + 1) - iD); + const auto pHsum = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH); + const auto pWsum = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW); + + const bool isPDasymm = pD != (pDsum - pD); + const bool isPHasymm = pH != (pHsum - pH); + const bool isPWasymm = pW != (pWsum - pW); + + if(!isPDasymm && !isPHasymm && !isPWasymm) + return; + + std::vector newShape = input->getShapeAsVector(); + + const int iDposition = isNCDHW ? 
2 : 1; + + if(isPDasymm) + newShape[iDposition] += 1; + if(isPHasymm) + newShape[iDposition + 1] += 1; + if(isPWasymm) + newShape[iDposition + 2] += 1; + + NDArray* newInput = new NDArray(input->ordering(), newShape, input->dataType(), input->getContext()); + + if(isNCDHW) + (*newInput)({0,0, 0,0, 0,input->sizeAt(2), 0,input->sizeAt(3), 0,input->sizeAt(4)}).assign(input); + else + (*newInput)({0,0, 0,input->sizeAt(1), 0,input->sizeAt(2), 0,input->sizeAt(3), 0,0}).assign(input); + + input = newInput; + + if(gradI != nullptr) + gradI = new NDArray(gradI->ordering(), newShape, gradI->dataType(), gradI->getContext()); +} + +} +} +} + +#endif //SD_CUDNNUTILS_H diff --git a/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu new file mode 100644 index 000000000..d328fa92b --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu @@ -0,0 +1,443 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include "cudnnUtils.h" +#include + +namespace nd4j { +namespace ops { +namespace platforms { + + +////////////////////////////////////////////////////////////////////////// +static void depthwiseConv2dCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, + const int kH, const int kW, + const int sH, const int sW, + const int pH, const int pW, + const int dH, const int dW, + const int paddingMode, const bool isNCHW) { + + // cudnn supports only following case: mC = 1, oC = iC (groupCount == iC) + + // input [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc + // weights [iC, mC, kH, kW], mkl doesn't support this format, so we'll make permute + // bias [oC], may be nullptr + // output [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc + // oC = iC*mC + + int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH); + mC = weights->sizeAt(1); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: can't set stream for cuDNN", err); + + cudnnTensorFormat_t format = isNCHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + + // weights descriptor + cudnnFilterDescriptor_t w; + cudnnCreateFilterDescriptor(&w); + err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW); + if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); + + // output descriptor + cudnnTensorDescriptor_t z; + cudnnCreateTensorDescriptor(&z); + if(output->ews() == 1) + err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW); + else + err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); + err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolutionGroupCount failed", err); + + // algorithm description + cudnnConvolutionFwdAlgo_t algo; + err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + + // allocate auxiliary device memory, abbreviation ws means workspace + size_t wsSize; + err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + void* wsData; + auto cudaErr = cudaMalloc(&wsData, wsSize); + if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + + // provide scaling parameters + const float alpha32(1), beta32(0); + const double alpha64(1), beta64(0); + const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast(&alpha32) : reinterpret_cast(&alpha64); + const void* beta = output->sizeOfT() <= 4 ? 
reinterpret_cast(&beta32) : reinterpret_cast(&beta64); + + NDArray::prepareSpecialUse({output}, {input, weights, bias}); + + // run calculation + err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnConvolutionForward failed", err); + + // add bias if it is present + if (bias != nullptr) { + + cudnnTensorDescriptor_t b; + cudnnCreateTensorDescriptor(&b); + err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf()); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); + err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnAddTensor bias failed", err); + } + + // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); + // if (cudaErr != 0) + // throw cuda_exception::build("depthwiseConv2dCUDNN: cudaStreamSynchronize failed !", cudaErr); + + cudaErr = cudaFree(wsData); + if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + + NDArray::registerSpecialUse({output}, {input, weights, bias}); +} + +////////////////////////////////////////////////////////////////////////// +static void depthwiseConv2dBpCUDNN(const LaunchContext* context, + const NDArray* input, const NDArray* weights, const NDArray* gradO, + NDArray* gradI, NDArray* gradW, NDArray* gradB, + const int kH, const int kW, + const int sH, const int sW, + const int pH, const int pW, + const int dH, const int dW, + const int paddingMode, const bool isNCHW) { + + // cudnn supports only following case: mC = 1, oC = iC (groupCount == iC) + + // input, gradI [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc + // weights, gradW [iC, mC, kH, kW], mkl doesn't support this format, so we'll make permute + // gradB [oC], may be nullptr + // gradO [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc + // oC = iC*mC + + int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH); + mC = weights->sizeAt(1); + + auto handle = reinterpret_cast(context->getCuDnnHandle()); + cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: can't set stream for cuDNN", err); + + cudnnTensorFormat_t format = isNCHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; + + // input descriptor + cudnnTensorDescriptor_t x; + cudnnCreateTensorDescriptor(&x); + if(input->ews() == 1) + err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + + // gradO descriptor + cudnnTensorDescriptor_t dz; + cudnnCreateTensorDescriptor(&dz); + if(gradO->ews() == 1) + err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW); + else + err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); + + // gradI descriptor + cudnnTensorDescriptor_t dx; + cudnnCreateTensorDescriptor(&dx); + if(gradI->ews() == 1) + err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW); + else + err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1)); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); + + // gradW descriptor + cudnnFilterDescriptor_t dw; + cudnnCreateFilterDescriptor(&dw); + err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW); + if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); + + // description of convolution + cudnnConvolutionDescriptor_t conv; + cudnnCreateConvolutionDescriptor(&conv); + err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); + err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolutionGroupCount failed", err); + + // gradW algorithm description + cudnnConvolutionBwdFilterAlgo_t algoGradW; + err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + + // gradI algorithm description + cudnnConvolutionBwdDataAlgo_t algoGradI; + err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); + if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + + // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace + size_t wsGradWSize; + err = 
cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
+    void* wsGradWData;
+    auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
+
+    // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
+    size_t wsGradISize;
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
+    void* wsGradIData;
+    cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    // provide scaling parameters
+    const float alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+
+    // run calculation for gradB (if not nullptr)
+    if(gradB != nullptr) {
+        cudnnTensorDescriptor_t db;
+        cudnnCreateTensorDescriptor(&db);
+        err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1 : gradB->lengthOf());
+        if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err);
+
+        err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
+        if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
+    }
+
+    // run calculation for gradW
+    err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
+
+    // run calculation for gradI
+    err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardData failed", err);
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("depthwiseConv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    cudaErr = cudaFree(wsGradWData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
+    cudaErr = cudaFree(wsGradIData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+}
+
+//////////////////////////////////////////////////////////////////////////
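
Both depthwise helpers above lean on the same cuDNN mechanism: there is no dedicated depthwise primitive, so the convolution is expressed as a grouped convolution via cudnnSetConvolutionGroupCount(conv, iC), with the filter descriptor built as (iC, mC, kH, kW). In cuDNN's filter layout the first dimension is the total number of output channels and the second is the number of input channels per group, so that descriptor matches a depthwise filter only when the channel multiplier mC is 1 (then oC = iC and iC/groupCount = 1); this is exactly the mC == 1 condition the PLATFORM_CHECK implementations below enforce. A minimal standalone sketch of the constraint, assuming nothing beyond the standard library (depthwiseExpressible and FilterDims are illustrative names, not part of this patch):

#include <cstdio>

// cuDNN filter layout for grouped convolutions: (k, c, h, w) = (total output channels, input channels per group, kH, kW)
struct FilterDims { int k, c, h, w; };

// Returns true when a depthwise filter [iC, mC, kH, kW] can be handed to cuDNN as-is
// with groupCount == iC, i.e. when (iC, mC, kH, kW) coincides with (oC, iC/groupCount, kH, kW).
static bool depthwiseExpressible(int iC, int mC, int kH, int kW, FilterDims* dims) {
    const int groupCount = iC;
    const int oC = iC * mC;                                     // depthwise output channel count
    dims->k = iC; dims->c = mC; dims->h = kH; dims->w = kW;     // what the helpers above pass to cudnnSetFilter4dDescriptor
    return dims->k == oC && dims->c == iC / groupCount;         // holds exactly when mC == 1
}

int main() {
    FilterDims d;
    std::printf("mC = 1 expressible: %d\n", depthwiseExpressible(32, 1, 3, 3, &d));  // 1 -> cuDNN path usable
    std::printf("mC = 2 expressible: %d\n", depthwiseExpressible(32, 2, 3, 3, &d));  // 0 -> platform check rejects it
    return 0;
}

When the check fails (for example mC > 1 or causal padding), the platform helper is skipped and the op's default, non-cuDNN implementation runs instead.
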
+PLATFORM_IMPL(depthwise_conv2d, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always + auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] = iC*mC + + auto output = OUTPUT_VARIABLE(0); // [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW) + + REQUIRE_TRUE(input->rankOf() == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf()); + + int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) height + int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME + int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width + int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH); + mC = weights->sizeAt(indWmC); // channels multiplier + + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode); + + std::vector expectedWeightsShape = {kH, kW, iC, mC}; + REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str()); + REQUIRE_TRUE(output->sizeAt(indIOioC) == iC*mC, 0, "DEPTHWISECONV2D CUDNN OP: the output_channels must be equal to input_channels * channels_multiplier = %i !", iC*mC); + if (bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf()); + + NDArray* newWeights = new NDArray(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support format {oC, iC/groupCount, kH, kW} + newWeights->assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC --> iC, mC, kH, kW) + + NDArray* newInput = input; + NDArray* newGradI = nullptr; + if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings + checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW); + + depthwiseConv2dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW); + + if(newInput != input) + delete newInput; + + delete newWeights; + + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_CHECK(depthwise_conv2d, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] 
(NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always + auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] = iC*mC + + const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL + + const int mC = weights->sizeAt(3); + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return mC == 1 && paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType; +} + +////////////////////////////////////////////////////////////////////////// +PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] = [iC*mC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NDHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW), epsilon + auto gradW = OUTPUT_VARIABLE(1); // [kH, kW, iC, mC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + REQUIRE_TRUE(input->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf()); + REQUIRE_TRUE(gradO->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf()); + + int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) height + int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME + int isNCHW = block.getIArguments()->size() > 9 ? 
!INT_ARG(9) : 1; // INT_ARG(9): 1-NHWC, 0-NCHW + + int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width + int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH); + mC = weights->sizeAt(indWmC); // channels multiplier + + int trueoH, trueoW; // correct output height, width + ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode); + + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode); + + std::vector expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW, 0,indIOioC,indOoH,indOoH+1}); + std::vector expectedWeightsShape = {kH, kW, iC, mC}; + REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if(bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf()); + + + NDArray* newGradW = new NDArray(gradW->ordering(), {iC, mC, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support format {oC, iC/groupCount, kH, kW} + NDArray* newWeights = new NDArray(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext()); + + newWeights->assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC --> iC, mC, kH, kW) + + NDArray* newInput = input; + NDArray* newGradI = gradI; + if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings + checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW); + + depthwiseConv2dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW); + + newGradW->permutei({2,3,0,1}); // [iC, mC, kH, kW] -> [kH, kW, iC, mC] + gradW->assign(newGradW); + + if(newInput != input) { + + if(isNCHW) + gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3)})); + else + gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,0})); + + delete newInput; + delete newGradI; + } + + delete newWeights; + delete newGradW; + + return Status::OK(); +} + +PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CUDA) { + + auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] = [iC*mC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NDHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next + + const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL + const int isNCHW = block.getIArguments()->size() > 9 ? 
!INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + const int mC = weights->sizeAt(3); + + const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF; + const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF; + const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF; + const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF); + + return mC == 1 && isNCHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType; +} + + +} +} +} diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp index 9a3b2916b..bf614bfab 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp @@ -28,11 +28,12 @@ #include using namespace dnnl; +using namespace samediff; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(avgpool2d) { + PLATFORM_IMPL(avgpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead", @@ -128,7 +129,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(avgpool2d) { + PLATFORM_CHECK(avgpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp index 428bd6042..af1fd04fd 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp @@ -32,7 +32,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(avgpool2d_bp) { + PLATFORM_IMPL(avgpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto gradO = INPUT_VARIABLE( @@ -138,7 +138,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(avgpool2d_bp) { + PLATFORM_CHECK(avgpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp index 22ace87de..2456625ef 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp @@ -32,7 +32,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(avgpool3dnew) { + PLATFORM_IMPL(avgpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto output = OUTPUT_VARIABLE( @@ -130,7 +130,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(avgpool3dnew) { + PLATFORM_CHECK(avgpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp index 0c52608a0..3fd8ab293 100644 --- 
a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp @@ -31,7 +31,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(avgpool3dnew_bp) { + PLATFORM_IMPL(avgpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto gradO = INPUT_VARIABLE( @@ -143,7 +143,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(avgpool3dnew_bp) { + PLATFORM_CHECK(avgpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index c7111cc7a..8974cef14 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -375,7 +375,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const *dLdI += xMinusMean; } -PLATFORM_IMPL(batchnorm) { +PLATFORM_IMPL(batchnorm, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw auto mean = INPUT_VARIABLE(1); // [c] @@ -455,7 +455,7 @@ PLATFORM_IMPL(batchnorm) { } ////////////////////////////////////////////////////////////////////////// -PLATFORM_CHECK(batchnorm) { +PLATFORM_CHECK(batchnorm, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 // if (::optimalLevel() < 2) // return false; @@ -632,7 +632,7 @@ PLATFORM_CHECK(batchnorm) { ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(batchnorm_bp) { +PLATFORM_IMPL(batchnorm_bp, ENGINE_CPU) { NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw NDArray* mean = INPUT_VARIABLE(1); // [c] @@ -735,7 +735,7 @@ PLATFORM_IMPL(batchnorm_bp) { } ////////////////////////////////////////////////////////////////////////// -PLATFORM_CHECK(batchnorm_bp) { +PLATFORM_CHECK(batchnorm_bp, ENGINE_CPU) { NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw NDArray* mean = INPUT_VARIABLE(1); // [c] NDArray* variance = INPUT_VARIABLE(2); // [c] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp index a01679740..ba1711032 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp @@ -113,7 +113,7 @@ static void conv2d_mkldnn(nd4j::graph::Context &block, const NDArray *input, con } ////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(conv2d) { +PLATFORM_IMPL(conv2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] @@ -137,7 +137,7 @@ PLATFORM_IMPL(conv2d) { return Status::OK(); } -PLATFORM_CHECK(conv2d) { +PLATFORM_CHECK(conv2d, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 if (::optimalLevel() < 2) return false; @@ -151,7 +151,7 @@ PLATFORM_CHECK(conv2d) { } ////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(conv2d_bp) { +PLATFORM_IMPL(conv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always auto bias = block.width() > 3 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] @@ -328,7 +328,7 @@ PLATFORM_IMPL(conv2d_bp) { return Status::OK(); } -PLATFORM_CHECK(conv2d_bp) { +PLATFORM_CHECK(conv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 1e28e76a5..0a79df793 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -34,7 +34,7 @@ namespace ops { namespace platforms { ////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(conv3dnew) { +PLATFORM_IMPL(conv3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always @@ -150,7 +150,7 @@ PLATFORM_IMPL(conv3dnew) { return Status::OK(); } -PLATFORM_CHECK(conv3dnew) { +PLATFORM_CHECK(conv3dnew, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 if (::optimalLevel() < 2) return false; @@ -167,7 +167,7 @@ PLATFORM_CHECK(conv3dnew) { ////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(conv3dnew_bp) { +PLATFORM_IMPL(conv3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE( @@ -374,7 +374,7 @@ PLATFORM_IMPL(conv3dnew_bp) { return Status::OK(); } -PLATFORM_CHECK(conv3dnew_bp) { +PLATFORM_CHECK(conv3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE( diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp index f5c37a647..6db569eec 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp @@ -349,7 +349,7 @@ static void deconv2dBackPropMKLDNN(const NDArray* input, const NDArray* weights, ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(deconv2d) { +PLATFORM_IMPL(deconv2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always @@ -406,7 +406,7 @@ PLATFORM_IMPL(deconv2d) { return Status::OK(); } -PLATFORM_CHECK(deconv2d) { +PLATFORM_CHECK(deconv2d, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 // if (::optimalLevel() < 2) // return false; @@ -435,7 +435,7 @@ PLATFORM_CHECK(deconv2d) { ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(deconv2d_bp) { +PLATFORM_IMPL(deconv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always @@ -506,7 +506,7 @@ PLATFORM_IMPL(deconv2d_bp) { return Status::OK(); } -PLATFORM_CHECK(deconv2d_bp) { +PLATFORM_CHECK(deconv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always auto bias = block.width() > 3 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp index fac53e877..90ddb828e 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp @@ -145,7 +145,7 @@ static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* grad ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(deconv2d_tf) { +PLATFORM_IMPL(deconv2d_tf, ENGINE_CPU) { auto gradO = INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always @@ -222,7 +222,7 @@ PLATFORM_IMPL(deconv2d_tf) { return Status::OK(); } -PLATFORM_CHECK(deconv2d_tf) { +PLATFORM_CHECK(deconv2d_tf, ENGINE_CPU) { auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always auto gradO = INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW), gradI diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp index 7958ff2ce..a678e0185 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp @@ -360,7 +360,7 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights, ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(deconv3d) { +PLATFORM_IMPL(deconv3d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always @@ -421,7 +421,7 @@ PLATFORM_IMPL(deconv3d) { return Status::OK(); } -PLATFORM_CHECK(deconv3d) { +PLATFORM_CHECK(deconv3d, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 // if (::optimalLevel() < 2) // return false; @@ -451,7 +451,7 @@ PLATFORM_CHECK(deconv3d) { ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(deconv3d_bp) { +PLATFORM_IMPL(deconv3d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always @@ -525,7 +525,7 @@ PLATFORM_IMPL(deconv3d_bp) { } -PLATFORM_CHECK(deconv3d_bp) { +PLATFORM_CHECK(deconv3d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NHWC) or [bS, iD, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always auto bias = block.width() > 3 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp index f589065ab..f3b745d09 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp @@ -362,7 +362,7 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w ////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(depthwise_conv2d) { +PLATFORM_IMPL(depthwise_conv2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always @@ -400,7 +400,7 @@ PLATFORM_IMPL(depthwise_conv2d) { } ////////////////////////////////////////////////////////////////////// -PLATFORM_CHECK(depthwise_conv2d) { +PLATFORM_CHECK(depthwise_conv2d, ENGINE_CPU) { // we don't want to use mkldnn if cpu doesn't support avx/avx2 if (::optimalLevel() < 2) return false; @@ -427,7 +427,7 @@ PLATFORM_CHECK(depthwise_conv2d) { } ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(depthwise_conv2d_bp) { +PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always @@ -476,7 +476,7 @@ PLATFORM_IMPL(depthwise_conv2d_bp) { } ////////////////////////////////////////////////////////////////////// -PLATFORM_CHECK(depthwise_conv2d_bp) { +PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp index ecd8b4c1a..a0f2f6151 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp @@ -32,7 +32,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(lrn) { + PLATFORM_IMPL(lrn, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); @@ -82,7 +82,7 @@ namespace nd4j { return Status::OK(); }; - PLATFORM_CHECK(lrn) { + PLATFORM_CHECK(lrn, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp index 7417653b3..3371b16ad 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp @@ -365,7 +365,7 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray* } ////////////////////////////////////////////////////////////////////////// -PLATFORM_IMPL(lstmLayer) { +PLATFORM_IMPL(lstmLayer, ENGINE_CPU) { const auto dataFormat = INT_ARG(0); // for unidirectional: 0 = [sL, bS, nIn], 1 = [bS, sL ,nIn], 2 = [bS, nIn, sL], for bidirectional: 3 = [sL, 2, bS, nOut] (for ONNX) const auto directionMode = INT_ARG(1); // direction: 0 = fwd, 1 = bwd, 2 = bidirectional sum, 3 = bidirectional concat, 4 = bidirectional extra output dim (in conjunction with format dataFormat = 3) @@ -493,7 +493,7 @@ PLATFORM_IMPL(lstmLayer) { return Status::OK(); } -PLATFORM_CHECK(lstmLayer) { +PLATFORM_CHECK(lstmLayer, 
ENGINE_CPU) { const auto hasBiases = B_ARG(0); // indicates whether biases array is provided const auto hasInitH = B_ARG(2); // indicates whether initial output is provided const auto hasInitC = B_ARG(3); // indicates whether initial cell state is provided diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp index 03008fbc6..975cf7fe1 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp @@ -32,7 +32,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(maxpool2d) { + PLATFORM_IMPL(maxpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead", @@ -134,7 +134,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(maxpool2d) { + PLATFORM_CHECK(maxpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp index e50bef362..686bdc7fb 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp @@ -32,7 +32,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(maxpool2d_bp) { + PLATFORM_IMPL(maxpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) auto gradO = INPUT_VARIABLE( @@ -163,7 +163,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(maxpool2d_bp) { + PLATFORM_CHECK(maxpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp index 6f132bb56..604bdcb6b 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp @@ -31,7 +31,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(maxpool3dnew) { + PLATFORM_IMPL(maxpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto output = OUTPUT_VARIABLE( @@ -140,7 +140,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(maxpool3dnew) { + PLATFORM_CHECK(maxpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp index 4f51d6633..b684df1bb 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp @@ -31,7 +31,7 @@ using namespace dnnl; namespace nd4j { namespace ops { namespace platforms { - PLATFORM_IMPL(maxpool3dnew_bp) { + PLATFORM_IMPL(maxpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE( 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto gradO = INPUT_VARIABLE( @@ -170,7 +170,7 @@ namespace nd4j { return Status::OK(); } - PLATFORM_CHECK(maxpool3dnew_bp) { + PLATFORM_CHECK(maxpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); diff --git 
a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h index c54bf4db5..b55103a02 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h @@ -29,6 +29,8 @@ #include #include +using namespace samediff; + namespace nd4j{ namespace ops { @@ -36,50 +38,51 @@ namespace nd4j{ /** * Here we actually declare our platform helpers */ - DECLARE_PLATFORM(conv2d); + DECLARE_PLATFORM(conv2d, ENGINE_CPU); - DECLARE_PLATFORM(conv2d_bp); + DECLARE_PLATFORM(conv2d_bp, ENGINE_CPU); - DECLARE_PLATFORM(avgpool2d); + DECLARE_PLATFORM(avgpool2d, ENGINE_CPU); - DECLARE_PLATFORM(avgpool2d_bp); + DECLARE_PLATFORM(avgpool2d_bp, ENGINE_CPU); - DECLARE_PLATFORM(maxpool2d); + DECLARE_PLATFORM(maxpool2d, ENGINE_CPU); - DECLARE_PLATFORM(maxpool2d_bp); + DECLARE_PLATFORM(maxpool2d_bp, ENGINE_CPU); - DECLARE_PLATFORM(conv3dnew); + DECLARE_PLATFORM(conv3dnew, ENGINE_CPU); - DECLARE_PLATFORM(conv3dnew_bp); + DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CPU); - DECLARE_PLATFORM(maxpool3dnew); + DECLARE_PLATFORM(maxpool3dnew, ENGINE_CPU); - DECLARE_PLATFORM(maxpool3dnew_bp); + DECLARE_PLATFORM(maxpool3dnew_bp, ENGINE_CPU); - DECLARE_PLATFORM(avgpool3dnew); + DECLARE_PLATFORM(avgpool3dnew, ENGINE_CPU); - DECLARE_PLATFORM(avgpool3dnew_bp); + DECLARE_PLATFORM(avgpool3dnew_bp, ENGINE_CPU); - DECLARE_PLATFORM(lrn); + DECLARE_PLATFORM(lrn, ENGINE_CPU); - DECLARE_PLATFORM(batchnorm); + DECLARE_PLATFORM(batchnorm, ENGINE_CPU); - DECLARE_PLATFORM(batchnorm_bp); + DECLARE_PLATFORM(batchnorm_bp, ENGINE_CPU); - DECLARE_PLATFORM(lstmLayer); + DECLARE_PLATFORM(lstmLayer, ENGINE_CPU); - DECLARE_PLATFORM(deconv2d); + DECLARE_PLATFORM(deconv2d, ENGINE_CPU); - DECLARE_PLATFORM(deconv2d_tf); + DECLARE_PLATFORM(deconv2d_tf, ENGINE_CPU); - DECLARE_PLATFORM(deconv3d); + DECLARE_PLATFORM(deconv3d, ENGINE_CPU); - DECLARE_PLATFORM(deconv2d_bp); + DECLARE_PLATFORM(deconv2d_bp, ENGINE_CPU); - DECLARE_PLATFORM(deconv3d_bp); + DECLARE_PLATFORM(deconv3d_bp, ENGINE_CPU); - DECLARE_PLATFORM(depthwise_conv2d); - DECLARE_PLATFORM(depthwise_conv2d_bp); + DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CPU); + + DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CPU); } } diff --git a/libnd4j/include/platform_boilerplate.h b/libnd4j/include/platform_boilerplate.h index d3883bcf7..5c73a1b38 100644 --- a/libnd4j/include/platform_boilerplate.h +++ b/libnd4j/include/platform_boilerplate.h @@ -21,25 +21,37 @@ #ifndef SD_PLATFORM_BOILERPLATE_H #define SD_PLATFORM_BOILERPLATE_H - -#define DECLARE_PLATFORM(NAME) class ND4J_EXPORT PLATFORM_##NAME : public PlatformHelper {\ - public: \ - PLATFORM_##NAME() : PlatformHelper(#NAME) { } \ - bool isUsable(graph::Context &context) override; \ - Nd4jStatus invokeHelper(graph::Context &context) override; \ - }; - -#define PLATFORM_IMPL(NAME) struct ND4J_EXPORT __registratorPlatformHelper_##NAME { \ - __registratorPlatformHelper_##NAME() { \ - auto helper = new PLATFORM_##NAME(); \ - OpRegistrator::getInstance()->registerHelper(helper); \ - } \ - }; \ - static __registratorPlatformHelper_##NAME platformHelper_##NAME; \ - Nd4jStatus PLATFORM_##NAME::invokeHelper(nd4j::graph::Context &block) +#include -#define PLATFORM_CHECK(NAME) bool PLATFORM_##NAME::isUsable(graph::Context &block) + +#define CONCATP(A,B) A ##_##B + + +#define DECLARE_PLATFORM_F(NAME, ENGINE, CNAME) class ND4J_EXPORT PLATFORM_##CNAME : public PlatformHelper {\ + public: \ + PLATFORM_##CNAME() : PlatformHelper(#NAME, 
samediff::Engine::ENGINE) { } \ + bool isUsable(graph::Context &context) override; \ + Nd4jStatus invokeHelper(graph::Context &context) override; \ + }; + +#define DECLARE_PLATFORM(NAME, ENGINE) DECLARE_PLATFORM_F(NAME, ENGINE, NAME ##_## ENGINE) + +#define PLATFORM_IMPL_F(NAME, ENGINE, CNAME) struct ND4J_EXPORT __registratorPlatformHelper_##CNAME { \ + __registratorPlatformHelper_##CNAME() { \ + auto helper = new PLATFORM_##CNAME(); \ + OpRegistrator::getInstance()->registerHelper(helper); \ + } \ + }; \ + static __registratorPlatformHelper_##CNAME platformHelper_##CNAME; \ + Nd4jStatus PLATFORM_##CNAME::invokeHelper(nd4j::graph::Context &block) + + +#define PLATFORM_IMPL(NAME, ENGINE) PLATFORM_IMPL_F(NAME, ENGINE, NAME ##_## ENGINE) + + +#define PLATFORM_CHECK_F(NAME, ENGINE, CNAME) bool PLATFORM_##CNAME::isUsable(graph::Context &block) +#define PLATFORM_CHECK(NAME, ENGINE) PLATFORM_CHECK_F(NAME, ENGINE, NAME ##_## ENGINE) #endif //SD_PLATFORM_BOILERPLATE_H diff --git a/libnd4j/include/play.h b/libnd4j/include/play.h index ecafe84ea..d0fecee82 100644 --- a/libnd4j/include/play.h +++ b/libnd4j/include/play.h @@ -21,8 +21,9 @@ #ifndef LIBND4J_PLAY_H #define LIBND4J_PLAY_H -#include - +//#include +#include +/* #define DATA_TYPES \ (DATA_FLOAT, float) ,\ (DATA_DOUBLE, double) @@ -41,6 +42,9 @@ BUILD_SINGLE_TEMPLATE_TWICE(template class functionName, , DATA_TYPES) + */ + +DECLARE_PLATFORM(conv2d, ENGINE_CPU) //BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functionName, (signature), DATA_TYPES, Y_TYPES); diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 9ee58797e..f538eb9cd 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -135,12 +135,18 @@ elseif(CUDA_BLAS) add_executable(runtests ${TEST_SOURCES}) - message("MSVC runtime for tests: ${MSVC_RT_LIB}") + if (WIN32) + message("MSVC runtime for tests: ${MSVC_RT_LIB}") + endif() # applies to windows only set_property(TARGET runtests PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") set_property(TARGET gtest PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") set_property(TARGET gtest_main PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") - target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} gtest gtest_main) + if (HAVE_CUDNN) + message("CUDNN library: ${CUDNN}") + endif() + + target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN} gtest gtest_main) endif() \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index d9335a6d6..9ed9f0ee6 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -308,7 +308,7 @@ TEST_F(ConvolutionTests1, conv2d_8) { auto results = op.execute({&input, &weights, &bias}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); - // output->printIndexedBuffer(); + // output->printBuffer(); ASSERT_EQ(Status::OK(), results->status()); @@ -635,25 +635,63 @@ TYPED_TEST(TypedConvolutionTests1, conv2D_BP_NoBias_1) { } TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { - TypeParam _expBFF[] = {10025.0f, 10350.0f, 10675.0f, 11000.0f, 11325.0f, 11650.0f, 13275.0f, 13600.0f, 13925.0f, 14250.0f, 14575.0f, 14900.0f, 16525.0f, 16850.0f, 
17175.0f, 17500.0f, 17825.0f, 18150.0f, 19775.0f, 20100.0f, 20425.0f, 20750.0f, 21075.0f, 21400.0f, 23025.0f, 23350.0f, 23675.0f, 24000.0f, 24325.0f, 24650.0f, 26275.0f, 26600.0f, 26925.0f, 27250.0f, 27575.0f, 27900.0f, 53150.0f, 55350.0f, 57550.0f, 59750.0f, 61950.0f, 64150.0f, 75150.0f, 77350.0f, 79550.0f, 81750.0f, 83950.0f, 86150.0f, 97150.0f, 99350.0f, 101550.0f, 103750.0f, 105950.0f, 108150.0f, 119150.0f, 121350.0f, 123550.0f, 125750.0f, 127950.0f, 130150.0f, 141150.0f, 143350.0f, 145550.0f, 147750.0f, 149950.0f, 152150.0f, 163150.0f, 165350.0f, 167550.0f, 169750.0f, 171950.0f, 174150.0f, 119400.0f, 120350.0f, 121300.0f, 122250.0f, 123200.0f, 124150.0f, 128900.0f, 129850.0f, 130800.0f, 131750.0f, 132700.0f, 133650.0f, 138400.0f, 139350.0f, 140300.0f, 141250.0f, 142200.0f, 143150.0f, 147900.0f, 148850.0f, 149800.0f, 150750.0f, 151700.0f, 152650.0f, 157400.0f, 158350.0f, 159300.0f, 160250.0f, 161200.0f, 162150.0f, 166900.0f, 167850.0f, 168800.0f, 169750.0f, 170700.0f, 171650.0f, 350025.0f, 352850.0f, 355675.0f, 358500.0f, 361325.0f, 364150.0f, 378275.0f, 381100.0f, 383925.0f, 386750.0f, 389575.0f, 392400.0f, 406525.0f, 409350.0f, 412175.0f, 415000.0f, 417825.0f, 420650.0f, 434775.0f, 437600.0f, 440425.0f, 443250.0f, 446075.0f, 448900.0f, 463025.0f, 465850.0f, 468675.0f, 471500.0f, 474325.0f, 477150.0f, 491275.0f, 494100.0f, 496925.0f, 499750.0f, 502575.0f, 505400.0f, 353775.0f, 355350.0f, 356925.0f, 358500.0f, 360075.0f, 361650.0f, 369525.0f, 371100.0f, 372675.0f, 374250.0f, 375825.0f, 377400.0f, 385275.0f, 386850.0f, 388425.0f, 390000.0f, 391575.0f, 393150.0f, 401025.0f, 402600.0f, 404175.0f, 405750.0f, 407325.0f, 408900.0f, 416775.0f, 418350.0f, 419925.0f, 421500.0f, 423075.0f, 424650.0f, 432525.0f, 434100.0f, 435675.0f, 437250.0f, 438825.0f, 440400.0f, 771900.0f, 775350.0f, 778800.0f, 782250.0f, 785700.0f, 789150.0f, 806400.0f, 809850.0f, 813300.0f, 816750.0f, 820200.0f, 823650.0f, 840900.0f, 844350.0f, 847800.0f, 851250.0f, 854700.0f, 858150.0f, 875400.0f, 878850.0f, 882300.0f, 885750.0f, 889200.0f, 892650.0f, 909900.0f, 913350.0f, 916800.0f, 920250.0f, 923700.0f, 927150.0f, 944400.0f, 947850.0f, 951300.0f, 954750.0f, 958200.0f, 961650.0f, 107525.0f, 107850.0f, 108175.0f, 108500.0f, 108825.0f, 109150.0f, 110775.0f, 111100.0f, 111425.0f, 111750.0f, 112075.0f, 112400.0f, 114025.0f, 114350.0f, 114675.0f, 115000.0f, 115325.0f, 115650.0f, 117275.0f, 117600.0f, 117925.0f, 118250.0f, 118575.0f, 118900.0f, 120525.0f, 120850.0f, 121175.0f, 121500.0f, 121825.0f, 122150.0f, 123775.0f, 124100.0f, 124425.0f, 124750.0f, 125075.0f, 125400.0f, 713150.0f, 715350.0f, 717550.0f, 719750.0f, 721950.0f, 724150.0f, 735150.0f, 737350.0f, 739550.0f, 741750.0f, 743950.0f, 746150.0f, 757150.0f, 759350.0f, 761550.0f, 763750.0f, 765950.0f, 768150.0f, 779150.0f, 781350.0f, 783550.0f, 785750.0f, 787950.0f, 790150.0f, 801150.0f, 803350.0f, 805550.0f, 807750.0f, 809950.0f, 812150.0f, 823150.0f, 825350.0f, 827550.0f, 829750.0f, 831950.0f, 834150.0f, 404400.0f, 405350.0f, 406300.0f, 407250.0f, 408200.0f, 409150.0f, 413900.0f, 414850.0f, 415800.0f, 416750.0f, 417700.0f, 418650.0f, 423400.0f, 424350.0f, 425300.0f, 426250.0f, 427200.0f, 428150.0f, 432900.0f, 433850.0f, 434800.0f, 435750.0f, 436700.0f, 437650.0f, 442400.0f, 443350.0f, 444300.0f, 445250.0f, 446200.0f, 447150.0f, 451900.0f, 452850.0f, 453800.0f, 454750.0f, 455700.0f, 456650.0f, 1197525.0f, 1200350.0f, 1203175.0f, 1206000.0f, 1208825.0f, 1211650.0f, 1225775.0f, 1228600.0f, 1231425.0f, 1234250.0f, 1237075.0f, 1239900.0f, 1254025.0f, 1256850.0f, 1259675.0f, 
1262500.0f, 1265325.0f, 1268150.0f, 1282275.0f, 1285100.0f, 1287925.0f, 1290750.0f, 1293575.0f, 1296400.0f, 1310525.0f, 1313350.0f, 1316175.0f, 1319000.0f, 1321825.0f, 1324650.0f, 1338775.0f, 1341600.0f, 1344425.0f, 1347250.0f, 1350075.0f, 1352900.0f, 826275.0f, 827850.0f, 829425.0f, 831000.0f, 832575.0f, 834150.0f, 842025.0f, 843600.0f, 845175.0f, 846750.0f, 848325.0f, 849900.0f, 857775.0f, 859350.0f, 860925.0f, 862500.0f, 864075.0f, 865650.0f, 873525.0f, 875100.0f, 876675.0f, 878250.0f, 879825.0f, 881400.0f, 889275.0f, 890850.0f, 892425.0f, 894000.0f, 895575.0f, 897150.0f, 905025.0f, 906600.0f, 908175.0f, 909750.0f, 911325.0f, 912900.0f, 1806900.0f, 1810350.0f, 1813800.0f, 1817250.0f, 1820700.0f, 1824150.0f, 1841400.0f, 1844850.0f, 1848300.0f, 1851750.0f, 1855200.0f, 1858650.0f, 1875900.0f, 1879350.0f, 1882800.0f, 1886250.0f, 1889700.0f, 1893150.0f, 1910400.0f, 1913850.0f, 1917300.0f, 1920750.0f, 1924200.0f, 1927650.0f, 1944900.0f, 1948350.0f, 1951800.0f, 1955250.0f, 1958700.0f, 1962150.0f, 1979400.0f, 1982850.0f, 1986300.0f, 1989750.0f, 1993200.0f, 1996650.f}; - Nd4jLong _expSFF[] = {4, 2, 6, 6, 6, 216, 36, 6, 1, typeid(TypeParam) == typeid(float) ? 8192 : 16384, 1, 99,}; - NDArray expFF(_expBFF, _expSFF); - TypeParam _exp2BFF[] = {827.4900282f, 832.2350283f, 836.9800284f, 841.725028f, 846.4700287f, 851.2150288f, 874.9400293f, 879.6850294f, 884.4300295f, 889.1750296f, 893.9200297f, 898.665029f, 922.3900304f, 927.1350305f, 931.8800306f, 936.6250307f, 941.3700308f, 946.1150309f, 969.8400315f, 974.5850316f, 979.3300317f, 984.0750318f, 988.8200319f, 993.5650320f, 1017.2900326f, 1022.0350327f, 1026.7800328f, 1031.5250329f, 1036.2700330f, 1041.0150331f, 1064.7400337f, 1069.4850338f, 1074.2300339f, 1078.9750340f, 1083.7200341f, 1088.4650342f, 1822.4550553f, 1833.995055f, 1845.5350558f, 1857.075056f, 1868.6150563f, 1880.1550566f, 1937.8550578f, 1949.3950581f, 1960.9350583f, 1972.4750586f, 1984.015058f, 1995.5550591f, 2053.2550604f, 2064.7950606f, 2076.3350609f, 2087.8750611f, 2099.4150614f, 2110.955061f, 2168.6550629f, 2180.1950632f, 2191.7350634f, 2203.2750637f, 2214.8150639f, 2226.3550642f, 2284.0550655f, 2295.5950657f, 2307.1350660f, 2318.6750662f, 2330.2150665f, 2341.7550667f, 2399.4550680f, 2410.9950683f, 2422.5350685f, 2434.0750688f, 2445.6150690f, 2457.1550693f, 2817.419968f, 2835.7549686f, 2854.0899683f, 2872.4249680f, 2890.7599677f, 2909.0949674f, 3000.7699660f, 3019.104965f, 3037.4399655f, 3055.7749652f, 3074.1099649f, 3092.4449646f, 3184.1199632f, 3202.4549629f, 3220.789962f, 3239.1249624f, 3257.4599621f, 3275.7949618f, 3367.4699604f, 3385.8049601f, 3404.1399598f, 3422.474959f, 3440.8099593f, 3459.1449590f, 3550.8199576f, 3569.1549573f, 3587.4899570f, 3605.8249567f, 3624.1599565f, 3642.4949562f, 3734.1699548f, 3752.5049545f, 3770.8399542f, 3789.1749539f, 3807.5099536f, 3825.8449534f, 3812.385098f, 3837.5150988f, 3862.6450994f, 3887.7751000f, 3912.9051006f, 3938.0351012f, 4063.6851041f, 4088.8151047f, 4113.9451053f, 4139.0751059f, 4164.2051065f, 4189.3351071f, 4314.9851100f, 4340.1151106f, 4365.2451112f, 4390.3751118f, 4415.5051124f, 4440.6351130f, 4566.2851159f, 4591.4151165f, 4616.5451171f, 4641.6751177f, 4666.805118f, 4691.9351188f, 4817.5851218f, 4842.7151224f, 4867.8451230f, 4892.975123f, 4918.1051241f, 4943.2351247f, 5068.8851277f, 5094.0151283f, 5119.1451288f, 5144.2751294f, 5169.4051300f, 5194.5351306f, 4807.3499803f, 4839.2749801f, 4871.1999799f, 4903.1249797f, 4935.0499795f, 4966.9749793f, 5126.5999784f, 5158.5249782f, 5190.4499780f, 5222.3749778f, 5254.2999777f, 
5286.2249775f, 5445.8499765f, 5477.774976f, 5509.6999762f, 5541.6249760f, 5573.5499758f, 5605.4749756f, 5765.0999747f, 5797.0249745f, 5828.9499743f, 5860.8749741f, 5892.7999739f, 5924.724973f, 6084.3499728f, 6116.2749726f, 6148.1999724f, 6180.1249723f, 6212.0499721f, 6243.9749719f, 6403.59997f, 6435.5249708f, 6467.4499706f, 6499.3749704f, 6531.2999702f, 6563.2249700f, 5802.3150007f, 5841.0350006f, 5879.7550005f, 5918.4750004f, 5957.195000f, 5995.9150003f, 6189.5149999f, 6228.2349998f, 6266.9549997f, 6305.6749996f, 6344.3949995f, 6383.114999f, 6576.7149990f, 6615.4349990f, 6654.1549989f, 6692.8749988f, 6731.5949987f, 6770.3149986f, 6963.9149982f, 7002.6349981f, 7041.3549981f, 7080.0749980f, 7118.7949979f, 7157.5149978f, 7351.1149974f, 7389.8349973f, 7428.5549972f, 7467.2749972f, 7505.9949971f, 7544.7149970f, 7738.3149966f, 7777.0349965f, 7815.7549964f, 7854.4749963f, 7893.1949963f, 7931.9149962f, 6797.2799488f, 6842.794948f, 6888.3099489f, 6933.8249490f, 6979.3399491f, 7024.8549492f, 7252.4299497f, 7297.9449498f, 7343.4599499f, 7388.9749500f, 7434.489950f, 7480.0049501f, 7707.5799506f, 7753.0949507f, 7798.6099508f, 7844.1249509f, 7889.6399510f, 7935.1549511f, 8162.7299515f, 8208.2449516f, 8253.7599517f, 8299.2749518f, 8344.7899519f, 8390.3049520f, 8617.8799525f, 8663.394952f, 8708.9099526f, 8754.4249527f, 8799.9399528f, 8845.4549529f, 9073.0299534f, 9118.5449535f, 9164.0599536f, 9209.5749537f, 9255.089953f, 9300.604953f, 7792.2451647f, 7844.5551655f, 7896.8651663f, 7949.1751671f, 8001.4851679f, 8053.7951686f, 8315.3451725f, 8367.6551733f, 8419.9651741f, 8472.2751749f, 8524.585175f, 8576.8951764f, 8838.4451803f, 8890.7551811f, 8943.0651819f, 8995.3751827f, 9047.6851834f, 9099.9951842f, 9361.5451881f, 9413.8551889f, 9466.1651897f, 9518.475190f, 9570.7851912f, 9623.0951920f, 9884.6451959f, 9936.9551967f, 9989.2651975f, 10041.5751982f, 10093.8851990f, 10146.1951998f, 10407.7452037f, 10460.0552045f, 10512.3652053f, 10564.6752060f, 10616.9852068f, 10669.2952076f, 8787.210074f, 8846.3150748f, 8905.4200750f, 8964.5250752f, 9023.6300755f, 9082.7350757f, 9378.2600768f, 9437.3650770f, 9496.4700773f, 9555.5750775f, 9614.6800777f, 9673.7850779f, 9969.3100791f, 10028.4150793f, 10087.5200795f, 10146.625079f, 10205.7300800f, 10264.8350802f, 10560.3600813f, 10619.465081f, 10678.5700818f, 10737.6750820f, 10796.7800822f, 10855.8850825f, 11151.4100836f, 11210.5150838f, 11269.6200840f, 11328.7250843f, 11387.8300845f, 11446.9350847f, 11742.4600858f, 11801.5650861f, 11860.6700863f, 11919.7750865f, 11978.880086f, 12037.9850870f, 9782.1750935f, 9848.0750935f, 9913.9750934f, 9979.8750934f, 10045.7750934f, 10111.6750933f, 10441.1750931f, 10507.0750931f, 10572.9750931f, 10638.8750930f, 10704.7750930f, 10770.6750930f, 11100.1750928f, 11166.0750927f, 11231.9750927f, 11297.8750927f, 11363.7750926f, 11429.6750926f, 11759.1750924f, 11825.0750924f, 11890.9750923f, 11956.8750923f, 12022.7750923f, 12088.6750922f, 12418.175092f, 12484.0750920f, 12549.9750920f, 12615.8750919f, 12681.7750919f, 12747.6750919f, 13077.1750917f, 13143.0750916f, 13208.9750916f, 13274.8750916f, 13340.7750915f, 13406.6750915f, 2250.990060f, 2255.7350610f, 2260.4800611f, 2265.2250612f, 2269.9700613f, 2274.7150614f, 2298.4400619f, 2303.185062f, 2307.9300622f, 2312.6750623f, 2317.4200624f, 2322.1650625f, 2345.8900630f, 2350.6350631f, 2355.380063f, 2360.1250634f, 2364.8700635f, 2369.6150636f, 2393.3400641f, 2398.0850642f, 2402.8300643f, 2407.5750644f, 2412.320064f, 2417.0650647f, 2440.7900652f, 2445.5350653f, 2450.2800654f, 2455.0250655f, 2459.7700656f, 
2464.515065f, 2488.2400663f, 2492.9850664f, 2497.7300665f, 2502.4750666f, 2507.2200667f, 2511.9650668f, 5284.4551315f, 5295.9951318f, 5307.535132f, 5319.0751323f, 5330.6151326f, 5342.1551328f, 5399.8551341f, 5411.3951343f, 5422.9351346f, 5434.475134f, 5446.0151351f, 5457.5551354f, 5515.2551366f, 5526.7951369f, 5538.3351371f, 5549.8751374f, 5561.4151376f, 5572.9551379f, 5630.6551392f, 5642.1951394f, 5653.7351397f, 5665.2751399f, 5676.8151402f, 5688.3551404f, 5746.0551417f, 5757.5951420f, 5769.1351422f, 5780.6751425f, 5792.2151427f, 5803.7551430f, 5861.455144f, 5872.9951445f, 5884.5351448f, 5896.0751450f, 5907.6151453f, 5919.1551455f, 8317.919884f, 8336.2548841f, 8354.5898838f, 8372.9248835f, 8391.2598832f, 8409.59488f, 8501.2698815f, 8519.6048813f, 8537.9398810f, 8556.2748807f, 8574.6098804f, 8592.9448801f, 8684.6198787f, 8702.9548784f, 8721.2898782f, 8739.6248779f, 8757.9598776f, 8776.2948773f, 8867.9698759f, 8886.3048756f, 8904.6398753f, 8922.9748751f, 8941.3098748f, 8959.6448745f, 9051.3198731f, 9069.6548728f, 9087.9898725f, 9106.3248722f, 9124.6598720f, 9142.9948717f, 9234.6698703f, 9253.0048700f, 9271.3398697f, 9289.6748694f, 9308.0098691f, 9326.3448689f, 11351.3852747f, 11376.5152753f, 11401.6452759f, 11426.7752765f, 11451.9052771f, 11477.0352777f, 11602.6852806f, 11627.8152812f, 11652.9452818f, 11678.0752824f, 11703.2052830f, 11728.335283f, 11853.9852865f, 11879.1152871f, 11904.2452877f, 11929.3752883f, 11954.505288f, 11979.6352894f, 12105.2852924f, 12130.4152930f, 12155.545293f, 12180.6752941f, 12205.8052947f, 12230.9352953f, 12356.5852983f, 12381.715298f, 12406.8452994f, 12431.9753000f, 12457.1053006f, 12482.2353012f, 12607.8853041f, 12633.0153047f, 12658.1453053f, 12683.2753059f, 12708.4053065f, 12733.5353071f, 14384.8499244f, 14416.7749242f, 14448.6999240f, 14480.6249238f, 14512.549923f, 14544.4749235f, 14704.0999225f, 14736.024922f, 14767.9499222f, 14799.8749220f, 14831.7999218f, 14863.7249216f, 15023.3499207f, 15055.2749205f, 15087.1999203f, 15119.1249201f, 15151.0499199f, 15182.9749197f, 15342.5999188f, 15374.5249186f, 15406.4499184f, 15438.374918f, 15470.2999181f, 15502.2249179f, 15661.84991f, 15693.7749168f, 15725.6999166f, 15757.6249164f, 15789.5499162f, 15821.4749160f, 15981.0999151f, 16013.0249149f, 16044.9499147f, 16076.8749145f, 16108.7999143f, 16140.7249142f, 17418.314976f, 17457.0349761f, 17495.7549760f, 17534.4749759f, 17573.1949758f, 17611.9149757f, 17805.5149753f, 17844.234975f, 17882.9549752f, 17921.6749751f, 17960.3949750f, 17999.1149749f, 18192.7149745f, 18231.4349744f, 18270.154974f, 18308.8749743f, 18347.5949742f, 18386.3149741f, 18579.9149737f, 18618.6349736f, 18657.3549735f, 18696.074973f, 18734.7949734f, 18773.5149733f, 18967.1149729f, 19005.8349728f, 19044.5549727f, 19083.2749726f, 19121.994972f, 19160.7149725f, 19354.3149721f, 19393.0349720f, 19431.7549719f, 19470.4749718f, 19509.1949717f, 19547.914971f, 20451.7799765f, 20497.2949766f, 20542.8099767f, 20588.3249768f, 20633.8399769f, 20679.3549770f, 20906.929977f, 20952.4449775f, 20997.9599776f, 21043.4749777f, 21088.9899778f, 21134.5049779f, 21362.0799784f, 21407.5949785f, 21453.1099786f, 21498.624978f, 21544.139978f, 21589.6549788f, 21817.2299793f, 21862.7449794f, 21908.2599795f, 21953.7749796f, 21999.2899797f, 22044.8049798f, 22272.3799802f, 22317.8949803f, 22363.4099804f, 22408.9249805f, 22454.4399806f, 22499.9549807f, 22727.529981f, 22773.044981f, 22818.5599813f, 22864.0749814f, 22909.5899815f, 22955.1049816f, 23485.2453985f, 23537.555399f, 23589.8654000f, 23642.1754008f, 23694.4854016f, 
23746.7954024f, 24008.3454063f, 24060.655407f, 24112.9654078f, 24165.2754086f, 24217.5854094f, 24269.8954102f, 24531.4454141f, 24583.7554148f, 24636.0654156f, 24688.3754164f, 24740.6854172f, 24792.99541f, 25054.545421f, 25106.8554226f, 25159.1654234f, 25211.4754242f, 25263.7854250f, 25316.0954257f, 25577.6454296f, 25629.9554304f, 25682.2654312f, 25734.5754320f, 25786.8854328f, 25839.1954335f, 26100.7454374f, 26153.0554382f, 26205.3654390f, 26257.6754398f, 26309.985440f, 26362.2954413f, 26518.7101423f, 26577.8151425f, 26636.920142f, 26696.0251430f, 26755.1301432f, 26814.2351434f, 27109.7601446f, 27168.8651448f, 27227.9701450f, 27287.0751452f, 27346.1801455f, 27405.2851457f, 27700.8101468f, 27759.9151470f, 27819.0201473f, 27878.1251475f, 27937.2301477f, 27996.33514f, 28291.8601491f, 28350.9651493f, 28410.0701495f, 28469.175149f, 28528.2801500f, 28587.3851502f, 28882.9101513f, 28942.0151516f, 29001.1201518f, 29060.2251520f, 29119.3301522f, 29178.4351525f, 29473.9601536f, 29533.0651538f, 29592.1701540f, 29651.2751543f, 29710.3801545f, 29769.4851547f, 29552.1750826f, 29618.0750825f, 29683.9750825f, 29749.8750825f, 29815.7750824f, 29881.6750824f, 30211.1750822f, 30277.0750822f, 30342.9750821f, 30408.8750821f, 30474.7750821f, 30540.6750820f, 30870.175081f, 30936.0750818f, 31001.9750818f, 31067.8750817f, 31133.7750817f, 31199.6750817f, 31529.1750815f, 31595.075081f, 31660.9750814f, 31726.8750814f, 31792.7750813f, 31858.6750813f, 32188.1750811f, 32254.0750811f, 32319.975081f, 32385.8750810f, 32451.7750810f, 32517.6750809f, 32847.1750808f, 32913.0750807f, 32978.9750807f, 33044.875080f, 33110.7750806f, 33176.67508062f}; - Nd4jLong _exp2SFF[] = {4, 2, 10, 6, 6, 360, 36, 6, 1, typeid(TypeParam) == typeid(float) ? 8192 : 16384, 1, 99}; - NDArray exp2FF(_exp2BFF, _exp2SFF); auto input = NDArrayFactory::create('c', {2, 3, 10, 10}); - auto weightsD = NDArrayFactory::create('c', {2, 3, 5, 5}); - auto weightsP = NDArrayFactory::create('c', {10, 6, 1, 1}); + auto weightsD = NDArrayFactory::create('c', {5, 5, 3, 2}, {1.f, 76.f, 26.f, 101.f, 51.f, 126.f, 2.f, 77.f, 27.f, 102.f, 52.f, 127.f, 3.f, 78.f, 28.f, 103.f, 53.f, 128.f, 4.f, 79.f, 29.f, 104.f, 54.f, 129.f, 5.f, 80.f, 30.f, 105.f, 55.f, 130.f, + 6.f, 81.f, 31.f, 106.f, 56.f, 131.f, 7.f, 82.f, 32.f, 107.f, 57.f, 132.f, 8.f, 83.f, 33.f, 108.f, 58.f, 133.f, 9.f, 84.f, 34.f, 109.f, 59.f, 134.f, 10.f, 85.f, 35.f, 110.f, 60.f, 135.f, + 11.f, 86.f, 36.f, 111.f, 61.f, 136.f, 12.f, 87.f, 37.f, 112.f, 62.f, 137.f, 13.f, 88.f, 38.f, 113.f, 63.f, 138.f, 14.f, 89.f, 39.f, 114.f, 64.f, 139.f, 15.f, 90.f, 40.f, 115.f, 65.f, 140.f, + 16.f, 91.f, 41.f, 116.f, 66.f, 141.f, 17.f, 92.f, 42.f, 117.f, 67.f, 142.f, 18.f, 93.f, 43.f, 118.f, 68.f, 143.f, 19.f, 94.f, 44.f, 119.f, 69.f, 144.f, 20.f, 95.f, 45.f, 120.f, 70.f, 145.f, + 21.f, 96.f, 46.f, 121.f, 71.f, 146.f, 22.f, 97.f, 47.f, 122.f, 72.f, 147.f, 23.f, 98.f, 48.f, 123.f, 73.f, 148.f, 24.f, 99.f, 49.f, 124.f, 74.f, 149.f, 25.f, 100.f, 50.f, 125.f, 75.f, 150.f}); + auto weightsP = NDArrayFactory::create('c', {1, 1, 6, 10}, {0.0001f, 0.0007f, 0.0013f, 0.0019f, 0.0025f, 0.0031f, 0.0037f, 0.0043f, 0.0049f, 0.0055f,0.0002f, 0.0008f, 0.0014f, 0.0020f, 0.0026f, 0.0032f, 0.0038f, 0.0044f, 0.0050f, 0.0056f, + 0.0003f, 0.0009f, 0.0015f, 0.0021f, 0.0027f, 0.0033f, 0.0039f, 0.0045f, 0.0051f, 0.0057f,0.0004f, 0.0010f, 0.0016f, 0.0022f, 0.0028f, 0.0034f, 0.0040f, 0.0046f, 0.0052f, 0.0058f, + 0.0005f, 0.0011f, 0.0017f, 0.0023f, 0.0029f, 0.0035f, 0.0041f, 0.0047f, 0.0053f, 0.0059f,0.0006f, 0.0012f, 0.0018f, 0.0024f, 0.0030f, 0.0036f, 
0.0042f, 0.0048f, 0.0054f, 0.0060f}); + auto expFF = NDArrayFactory::create('c', {2, 6, 6, 6}, {10025.0f,10350.0f,10675.0f,11000.0f,11325.0f,11650.0f,13275.0f,13600.0f,13925.0f,14250.0f,14575.0f,14900.0f,16525.0f,16850.0f, + 17175.0f,17500.0f,17825.0f,18150.0f,19775.0f,20100.0f,20425.0f,20750.0f,21075.0f,21400.0f,23025.0f,23350.0f,23675.0f,24000.0f, + 24325.0f,24650.0f,26275.0f,26600.0f,26925.0f,27250.0f,27575.0f,27900.0f,53150.0f,55350.0f,57550.0f,59750.0f,61950.0f,64150.0f, + 75150.0f,77350.0f,79550.0f,81750.0f,83950.0f,86150.0f,97150.0f,99350.0f,101550.0f,103750.0f,105950.0f,108150.0f,119150.0f, + 121350.0f,123550.0f,125750.0f,127950.0f,130150.0f,141150.0f,143350.0f,145550.0f,147750.0f,149950.0f,152150.0f,163150.0f, + 165350.0f,167550.0f,169750.0f,171950.0f,174150.0f,119400.0f,120350.0f,121300.0f,122250.0f,123200.0f,124150.0f,128900.0f, + 129850.0f,130800.0f,131750.0f,132700.0f,133650.0f,138400.0f,139350.0f,140300.0f,141250.0f,142200.0f,143150.0f,147900.0f, + 148850.0f,149800.0f,150750.0f,151700.0f,152650.0f,157400.0f,158350.0f,159300.0f,160250.0f,161200.0f,162150.0f,166900.0f, + 167850.0f,168800.0f,169750.0f,170700.0f,171650.0f,350025.0f,352850.0f,355675.0f,358500.0f,361325.0f,364150.0f,378275.0f, + 381100.0f,383925.0f,386750.0f,389575.0f,392400.0f,406525.0f,409350.0f,412175.0f,415000.0f,417825.0f,420650.0f,434775.0f, + 437600.0f,440425.0f,443250.0f,446075.0f,448900.0f,463025.0f,465850.0f,468675.0f,471500.0f,474325.0f,477150.0f,491275.0f, + 494100.0f,496925.0f,499750.0f,502575.0f,505400.0f,353775.0f,355350.0f,356925.0f,358500.0f,360075.0f,361650.0f,369525.0f, + 371100.0f,372675.0f,374250.0f,375825.0f,377400.0f,385275.0f,386850.0f,388425.0f,390000.0f,391575.0f,393150.0f,401025.0f, + 402600.0f,404175.0f,405750.0f,407325.0f,408900.0f,416775.0f,418350.0f,419925.0f,421500.0f,423075.0f,424650.0f,432525.0f, + 434100.0f,435675.0f,437250.0f,438825.0f,440400.0f,771900.0f,775350.0f,778800.0f,782250.0f,785700.0f,789150.0f,806400.0f, + 809850.0f,813300.0f,816750.0f,820200.0f,823650.0f,840900.0f,844350.0f,847800.0f,851250.0f,854700.0f,858150.0f,875400.0f, + 878850.0f,882300.0f,885750.0f,889200.0f,892650.0f,909900.0f,913350.0f,916800.0f,920250.0f,923700.0f,927150.0f,944400.0f, + 947850.0f,951300.0f,954750.0f,958200.0f,961650.0f,107525.0f,107850.0f,108175.0f,108500.0f,108825.0f,109150.0f,110775.0f, + 111100.0f,111425.0f,111750.0f,112075.0f,112400.0f,114025.0f,114350.0f,114675.0f,115000.0f,115325.0f,115650.0f,117275.0f, + 117600.0f,117925.0f,118250.0f,118575.0f,118900.0f,120525.0f,120850.0f,121175.0f,121500.0f,121825.0f,122150.0f,123775.0f, + 124100.0f,124425.0f,124750.0f,125075.0f,125400.0f,713150.0f,715350.0f,717550.0f,719750.0f,721950.0f,724150.0f,735150.0f, + 737350.0f,739550.0f,741750.0f,743950.0f,746150.0f,757150.0f,759350.0f,761550.0f,763750.0f,765950.0f,768150.0f,779150.0f, + 781350.0f,783550.0f,785750.0f,787950.0f,790150.0f,801150.0f,803350.0f,805550.0f,807750.0f,809950.0f,812150.0f,823150.0f, + 825350.0f,827550.0f,829750.0f,831950.0f,834150.0f,404400.0f,405350.0f,406300.0f,407250.0f,408200.0f,409150.0f,413900.0f, + 
414850.0f,415800.0f,416750.0f,417700.0f,418650.0f,423400.0f,424350.0f,425300.0f,426250.0f,427200.0f,428150.0f,432900.0f,433850.0f,434800.0f,435750.0f,436700.0f,437650.0f,442400.0f,443350.0f,444300.0f,445250.0f,446200.0f,447150.0f,451900.0f,452850.0f,453800.0f,454750.0f,455700.0f,456650.0f,1197525.0f,1200350.0f,1203175.0f,1206000.0f,1208825.0f,1211650.0f,1225775.0f,1228600.0f,1231425.0f,1234250.0f,1237075.0f,1239900.0f,1254025.0f,1256850.0f,1259675.0f,1262500.0f,1265325.0f,1268150.0f,1282275.0f,1285100.0f,1287925.0f,1290750.0f,1293575.0f,1296400.0f,1310525.0f,1313350.0f,1316175.0f,1319000.0f,1321825.0f,1324650.0f,1338775.0f,1341600.0f,1344425.0f,1347250.0f,1350075.0f,1352900.0f,826275.0f,827850.0f,829425.0f,831000.0f,832575.0f,834150.0f,842025.0f,843600.0f,845175.0f,846750.0f,848325.0f,849900.0f,857775.0f,859350.0f,860925.0f,862500.0f,864075.0f,865650.0f,873525.0f,875100.0f,876675.0f,878250.0f,879825.0f,881400.0f,889275.0f,890850.0f,892425.0f,894000.0f,895575.0f,897150.0f,905025.0f,906600.0f,908175.0f,909750.0f,911325.0f,912900.0f,1806900.0f,1810350.0f,1813800.0f,1817250.0f,1820700.0f,1824150.0f,1841400.0f,1844850.0f,1848300.0f,1851750.0f,1855200.0f,1858650.0f,1875900.0f,1879350.0f,1882800.0f,1886250.0f,1889700.0f,1893150.0f,1910400.0f,1913850.0f,1917300.0f,1920750.0f,1924200.0f,1927650.0f,1944900.0f,1948350.0f,1951800.0f,1955250.0f,1958700.0f,1962150.0f,1979400.0f,1982850.0f,1986300.0f,1989750.0f,1993200.0f,1996650.f}); + auto exp2FF = NDArrayFactory::create('c', {2, 10, 6, 6}, {827.4900282f,832.2350283f,836.9800284f,841.725028f,846.4700287f,851.2150288f,874.9400293f,879.6850294f,884.4300295f,889.1750296f,893.9200297f,898.665029f, + 922.3900304f,927.1350305f,931.8800306f,936.6250307f,941.3700308f,946.1150309f,969.8400315f,974.5850316f,979.3300317f,984.0750318f,988.8200319f,993.5650320f, + 1017.2900326f,1022.0350327f,1026.7800328f,1031.5250329f,1036.2700330f,1041.0150331f,1064.7400337f,1069.4850338f,1074.2300339f,1078.9750340f,1083.7200341f, + 1088.4650342f,1822.4550553f,1833.995055f,1845.5350558f,1857.075056f,1868.6150563f,1880.1550566f,1937.8550578f,1949.3950581f,1960.9350583f,1972.4750586f, + 1984.015058f,1995.5550591f,2053.2550604f,2064.7950606f,2076.3350609f,2087.8750611f,2099.4150614f,2110.955061f,2168.6550629f,2180.1950632f,2191.7350634f, + 2203.2750637f,2214.8150639f,2226.3550642f,2284.0550655f,2295.5950657f,2307.1350660f,2318.6750662f,2330.2150665f,2341.7550667f,2399.4550680f,2410.9950683f, + 2422.5350685f,2434.0750688f,2445.6150690f,2457.1550693f,2817.419968f,2835.7549686f,2854.0899683f,2872.4249680f,2890.7599677f,2909.0949674f,3000.7699660f, + 3019.104965f,3037.4399655f,3055.7749652f,3074.1099649f,3092.4449646f,3184.1199632f,3202.4549629f,3220.789962f,3239.1249624f,3257.4599621f,3275.7949618f, + 3367.4699604f,3385.8049601f,3404.1399598f,3422.474959f,3440.8099593f,3459.1449590f,3550.8199576f,3569.1549573f,3587.4899570f,3605.8249567f,3624.1599565f, + 3642.4949562f,3734.1699548f,3752.5049545f,3770.8399542f,3789.1749539f,3807.5099536f,3825.8449534f,3812.385098f,3837.5150988f,3862.6450994f,3887.7751000f, + 3912.9051006f,3938.0351012f,4063.6851041f,4088.8151047f,4113.9451053f,4139.0751059f,4164.2051065f,4189.3351071f,4314.9851100f,4340.1151106f,4365.2451112f, + 4390.3751118f,4415.5051124f,4440.6351130f,4566.2851159f,4591.4151165f,4616.5451171f,4641.6751177f,4666.805118f,4691.9351188f,4817.5851218f,4842.7151224f, + 4867.8451230f,4892.975123f,4918.1051241f,4943.2351247f,5068.8851277f,5094.0151283f,5119.1451288f,5144.2751294f,5169.4051300f,5194.5351306f,4807.3499803f, + 
4839.2749801f,4871.1999799f,4903.1249797f,4935.0499795f,4966.9749793f,5126.5999784f,5158.5249782f,5190.4499780f,5222.3749778f,5254.2999777f,5286.2249775f, + 5445.8499765f,5477.774976f,5509.6999762f,5541.6249760f,5573.5499758f,5605.4749756f,5765.0999747f,5797.0249745f,5828.9499743f,5860.8749741f,5892.7999739f, + 5924.724973f,6084.3499728f,6116.2749726f,6148.1999724f,6180.1249723f,6212.0499721f,6243.9749719f,6403.59997f,6435.5249708f,6467.4499706f,6499.3749704f, + 6531.2999702f,6563.2249700f,5802.3150007f,5841.0350006f,5879.7550005f,5918.4750004f,5957.195000f,5995.9150003f,6189.5149999f,6228.2349998f,6266.9549997f, + 6305.6749996f,6344.3949995f,6383.114999f,6576.7149990f,6615.4349990f,6654.1549989f,6692.8749988f,6731.5949987f,6770.3149986f,6963.9149982f,7002.6349981f, + 7041.3549981f,7080.0749980f,7118.7949979f,7157.5149978f,7351.1149974f,7389.8349973f,7428.5549972f,7467.2749972f,7505.9949971f,7544.7149970f,7738.3149966f,7777.0349965f,7815.7549964f,7854.4749963f,7893.1949963f,7931.9149962f,6797.2799488f,6842.794948f,6888.3099489f,6933.8249490f,6979.3399491f,7024.8549492f,7252.4299497f,7297.9449498f,7343.4599499f,7388.9749500f,7434.489950f,7480.0049501f,7707.5799506f,7753.0949507f,7798.6099508f,7844.1249509f,7889.6399510f,7935.1549511f,8162.7299515f,8208.2449516f,8253.7599517f,8299.2749518f,8344.7899519f,8390.3049520f,8617.8799525f,8663.394952f,8708.9099526f,8754.4249527f,8799.9399528f,8845.4549529f,9073.0299534f,9118.5449535f,9164.0599536f,9209.5749537f,9255.089953f,9300.604953f,7792.2451647f,7844.5551655f,7896.8651663f,7949.1751671f,8001.4851679f,8053.7951686f,8315.3451725f,8367.6551733f,8419.9651741f,8472.2751749f,8524.585175f,8576.8951764f,8838.4451803f,8890.7551811f,8943.0651819f,8995.3751827f,9047.6851834f,9099.9951842f,9361.5451881f,9413.8551889f,9466.1651897f,9518.475190f,9570.7851912f,9623.0951920f,9884.6451959f,9936.9551967f,9989.2651975f,10041.5751982f,10093.8851990f,10146.1951998f,10407.7452037f,10460.0552045f,10512.3652053f,10564.6752060f,10616.9852068f,10669.2952076f,8787.210074f,8846.3150748f,8905.4200750f,8964.5250752f,9023.6300755f,9082.7350757f,9378.2600768f,9437.3650770f,9496.4700773f,9555.5750775f,9614.6800777f,9673.7850779f,9969.3100791f,10028.4150793f,10087.5200795f,10146.625079f,10205.7300800f,10264.8350802f,10560.3600813f,10619.465081f,10678.5700818f,10737.6750820f,10796.7800822f,10855.8850825f,11151.4100836f,11210.5150838f,11269.6200840f,11328.7250843f,11387.8300845f,11446.9350847f,11742.4600858f,11801.5650861f,11860.6700863f,11919.7750865f,11978.880086f,12037.9850870f,9782.1750935f,9848.0750935f,9913.9750934f,9979.8750934f,10045.7750934f,10111.6750933f,10441.1750931f,10507.0750931f,10572.9750931f,10638.8750930f,10704.7750930f,10770.6750930f,11100.1750928f,11166.0750927f,11231.9750927f,11297.8750927f,11363.7750926f,11429.6750926f,11759.1750924f,11825.0750924f,11890.9750923f,11956.8750923f,12022.7750923f,12088.6750922f,12418.175092f,12484.0750920f,12549.9750920f,12615.8750919f,12681.7750919f,12747.6750919f,13077.1750917f,13143.0750916f,13208.9750916f,13274.8750916f,13340.7750915f,13406.6750915f,2250.990060f,2255.7350610f,2260.4800611f,2265.2250612f,2269.9700613f,2274.7150614f,2298.4400619f,2303.185062f,2307.9300622f,2312.6750623f,2317.4200624f,2322.1650625f,2345.8900630f,2350.6350631f,2355.380063f,2360.1250634f,2364.8700635f,2369.6150636f,2393.3400641f,2398.0850642f,2402.8300643f,2407.5750644f,2412.320064f,2417.0650647f,2440.7900652f,2445.5350653f,2450.2800654f,2455.0250655f,2459.7700656f,2464.515065f,2488.2400663f,2492.9850664f,2497.7300665f,2502.4750666f,2507.2200667f
,2511.9650668f,5284.4551315f,5295.9951318f,5307.535132f,5319.0751323f,5330.6151326f,5342.1551328f,5399.8551341f,5411.3951343f,5422.9351346f,5434.475134f,5446.0151351f,5457.5551354f,5515.2551366f,5526.7951369f,5538.3351371f,5549.8751374f,5561.4151376f,5572.9551379f,5630.6551392f,5642.1951394f,5653.7351397f,5665.2751399f,5676.8151402f,5688.3551404f,5746.0551417f,5757.5951420f,5769.1351422f,5780.6751425f,5792.2151427f,5803.7551430f,5861.455144f,5872.9951445f,5884.5351448f,5896.0751450f,5907.6151453f,5919.1551455f,8317.919884f,8336.2548841f,8354.5898838f,8372.9248835f,8391.2598832f,8409.59488f,8501.2698815f,8519.6048813f,8537.9398810f,8556.2748807f,8574.6098804f,8592.9448801f,8684.6198787f,8702.9548784f,8721.2898782f,8739.6248779f,8757.9598776f,8776.2948773f,8867.9698759f,8886.3048756f,8904.6398753f,8922.9748751f,8941.3098748f,8959.6448745f,9051.3198731f,9069.6548728f,9087.9898725f,9106.3248722f,9124.6598720f,9142.9948717f,9234.6698703f,9253.0048700f,9271.3398697f,9289.6748694f,9308.0098691f,9326.3448689f,11351.3852747f,11376.5152753f,11401.6452759f,11426.7752765f,11451.9052771f,11477.0352777f,11602.6852806f,11627.8152812f,11652.9452818f,11678.0752824f,11703.2052830f,11728.335283f,11853.9852865f,11879.1152871f,11904.2452877f,11929.3752883f,11954.505288f,11979.6352894f,12105.2852924f,12130.4152930f,12155.545293f,12180.6752941f,12205.8052947f,12230.9352953f,12356.5852983f,12381.715298f,12406.8452994f,12431.9753000f,12457.1053006f,12482.2353012f,12607.8853041f,12633.0153047f,12658.1453053f,12683.2753059f,12708.4053065f,12733.5353071f,14384.8499244f,14416.7749242f,14448.6999240f,14480.6249238f,14512.549923f,14544.4749235f,14704.0999225f,14736.024922f,14767.9499222f,14799.8749220f,14831.7999218f,14863.7249216f,15023.3499207f,15055.2749205f,15087.1999203f,15119.1249201f,15151.0499199f,15182.9749197f,15342.5999188f,15374.5249186f,15406.4499184f,15438.374918f,15470.2999181f,15502.2249179f,15661.84991f,15693.7749168f,15725.6999166f,15757.6249164f,15789.5499162f,15821.4749160f,15981.0999151f,16013.0249149f,16044.9499147f,16076.8749145f,16108.7999143f,16140.7249142f,17418.314976f,17457.0349761f,17495.7549760f,17534.4749759f,17573.1949758f,17611.9149757f,17805.5149753f,17844.234975f,17882.9549752f,17921.6749751f,17960.3949750f,17999.1149749f,18192.7149745f,18231.4349744f,18270.154974f,18308.8749743f,18347.5949742f,18386.3149741f,18579.9149737f,18618.6349736f,18657.3549735f,18696.074973f,18734.7949734f,18773.5149733f,18967.1149729f,19005.8349728f,19044.5549727f,19083.2749726f,19121.994972f,19160.7149725f,19354.3149721f,19393.0349720f,19431.7549719f,19470.4749718f,19509.1949717f,19547.914971f,20451.7799765f,20497.2949766f,20542.8099767f,20588.3249768f,20633.8399769f,20679.3549770f,20906.929977f,20952.4449775f,20997.9599776f,21043.4749777f,21088.9899778f,21134.5049779f,21362.0799784f,21407.5949785f,21453.1099786f,21498.624978f,21544.139978f,21589.6549788f,21817.2299793f,21862.7449794f,21908.2599795f,21953.7749796f,21999.2899797f,22044.8049798f,22272.3799802f,22317.8949803f,22363.4099804f,22408.9249805f,22454.4399806f,22499.9549807f,22727.529981f,22773.044981f,22818.5599813f,22864.0749814f,22909.5899815f,22955.1049816f,23485.2453985f,23537.555399f,23589.8654000f,23642.1754008f,23694.4854016f,23746.7954024f,24008.3454063f,24060.655407f,24112.9654078f,24165.2754086f,24217.5854094f,24269.8954102f,24531.4454141f,24583.7554148f,24636.0654156f,24688.3754164f,24740.6854172f,24792.99541f,25054.545421f,25106.8554226f,25159.1654234f,25211.4754242f,25263.7854250f,25316.0954257f,25577.6454296f,25629.9554304f,25682.2654312
f,25734.5754320f,25786.8854328f,25839.1954335f,26100.7454374f,26153.0554382f,26205.3654390f,26257.6754398f,26309.985440f,26362.2954413f,26518.7101423f,26577.8151425f,26636.920142f,26696.0251430f,26755.1301432f,26814.2351434f,27109.7601446f,27168.8651448f,27227.9701450f,27287.0751452f,27346.1801455f,27405.2851457f,27700.8101468f,27759.9151470f,27819.0201473f,27878.1251475f,27937.2301477f,27996.33514f,28291.8601491f,28350.9651493f,28410.0701495f,28469.175149f,28528.2801500f,28587.3851502f,28882.9101513f,28942.0151516f,29001.1201518f,29060.2251520f,29119.3301522f,29178.4351525f,29473.9601536f,29533.0651538f,29592.1701540f,29651.2751543f,29710.3801545f,29769.4851547f,29552.1750826f,29618.0750825f,29683.9750825f,29749.8750825f,29815.7750824f,29881.6750824f,30211.1750822f,30277.0750822f,30342.9750821f,30408.8750821f,30474.7750821f,30540.6750820f,30870.175081f,30936.0750818f,31001.9750818f,31067.8750817f,31133.7750817f,31199.6750817f,31529.1750815f,31595.075081f,31660.9750814f,31726.8750814f,31792.7750813f,31858.6750813f,32188.1750811f,32254.0750811f,32319.975081f,32385.8750810f,32451.7750810f,32517.6750809f,32847.1750808f,32913.0750807f,32978.9750807f,33044.875080f,33110.7750806f,33176.67508062f}); input.linspace(1); - weightsD.linspace(1); - weightsP.linspace(1); - weightsD.permutei({2,3,1,0}); - weightsP.permutei({2,3,1,0}); - - weightsP.applyScalar(scalar::Divide, 10000.0, weightsP); nd4j::ops::sconv2d op; auto resultFF = op.execute({&input, &weightsD}, {}, {5, 5, 1, 1, 0, 0, 1, 1, 0}, {}); @@ -669,6 +707,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { auto result2D = op2d.execute({z, &weightsP}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, {}); auto z2d = result2D->at(0); + // z2d->printBuffer(); ASSERT_TRUE(z2d->isSameShape(&exp2FF)); ASSERT_TRUE(z2d->equalsTo(&exp2FF)); @@ -1672,6 +1711,46 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test5) { delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test6) { + + int bS=2, iH=4,iW=3, iC=2,mC=1, kH=3,kW=2, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1; + int oH=2,oW=2; + int oC=iC*mC; + int paddingMode = 0; // 1-SAME, 0-VALID; + int dataFormat = 0; // 1-NHWC, 0-NCHW + + auto input = NDArrayFactory::create('c', {bS, iC, iH, iW}); + auto weights = NDArrayFactory::create('c', {kH, kW, iC, mC}); + auto bias = NDArrayFactory::create('c', {oC}, {3,4}); + auto gradO = NDArrayFactory::create('c', {bS, oC, oH, oW}); + + auto expGradI = NDArrayFactory::create('c', {bS, iC, iH, iW},{0.001, 0.005, 0.006, 0.008, 0.03, 0.026, 0.024, 0.07, 0.05, 0.027, 0.069, 0.044, 0.01, + 0.032, 0.024, 0.044, 0.12, 0.08, 0.092, 0.224, 0.136, 0.07, 0.164, 0.096, 0.009, 0.037, 0.03, 0.056, 0.158, 0.106, 0.136, + 0.326, 0.194, 0.099, 0.229, 0.132, 0.026, 0.08, 0.056, 0.108, 0.28, 0.176, 0.22, 0.512, 0.296, 0.15, 0.34, 0.192}); + + auto expGradW = NDArrayFactory::create('c', {kH, kW, iC, mC}, {1.04, 1.68, 1.04, 1.68, 1.04, 1.68, 1.04, 1.68, 1.04, 1.68, 1.04, 1.68}); + + input = 2.; + weights.linspace(0.1, 0.1); + gradO.linspace(0.01, 0.01); + + nd4j::ops::depthwise_conv2d_bp op; + auto results = op.execute({&input, &weights, &bias, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); + auto* gradI = results->at(0); + auto* gradW = results->at(1); + + ASSERT_EQ(Status::OK(), results->status()); + + ASSERT_TRUE(expGradI.isSameShape(gradI)); + ASSERT_TRUE(expGradI.equalsTo(gradI)); + + ASSERT_TRUE(expGradW.isSameShape(gradW)); + ASSERT_TRUE(expGradW.equalsTo(gradW)); + + delete results; +} + 
////////////////////////////////////////////////////////////////////// TYPED_TEST(TypedConvolutionTests1, conv3d_test1) { diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp index 01cce62c4..a16d9cfbd 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp @@ -2416,7 +2416,7 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_9) { ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expOutput.isSameShape(output)); - ASSERT_TRUE(expOutput.equalsTo(output)); + ASSERT_TRUE(expOutput.equalsTo(output, 1e-4)); delete results; } diff --git a/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu b/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu new file mode 100644 index 000000000..8809ad894 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu @@ -0,0 +1,128 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include + +#ifdef HAVE_CUDNN + +#include + +#endif + +using namespace nd4j; + +class CuDnnTests : public testing::Test { +public: + +}; + +static void printer(std::initializer_list helpers) { + + for (auto v:helpers) { + nd4j_printf("Initialized [%s]\n", v->name().c_str()); + } +} + + +TEST_F(CuDnnTests, helpers_includer) { + // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker +#ifdef HAVE_CUDNN + nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CUDA conv2d; + nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CUDA conv2d_bp; + nd4j::ops::platforms::PLATFORM_conv3dnew_ENGINE_CUDA conv3dnew; + nd4j::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CUDA conv3dnew_bp; + nd4j::ops::platforms::PLATFORM_depthwise_conv2d_ENGINE_CUDA depthwise_conv2d; + nd4j::ops::platforms::PLATFORM_depthwise_conv2d_bp_ENGINE_CUDA depthwise_conv2d_bp; + nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CUDA batchnorm; + + printer({&conv2d}); + printer({&conv2d_bp}); + printer({&conv3dnew}); + printer({&conv3dnew_bp}); + printer({&depthwise_conv2d}); + printer({&depthwise_conv2d_bp}); + printer({&batchnorm}); +#endif +} + + +TEST_F(CuDnnTests, mixed_helpers_test_1) { +#if defined(HAVE_CUDNN) && defined (HAVE_MKLDNN) + nd4j_printf("Mixed platforms test\n", ""); + + + int bS=2, iH=4,iW=3, iC=4,oC=3, kH=3,kW=2, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1; + int oH=2,oW=2; + int paddingMode = 0; // 1-SAME, 0-VALID; + int dataFormat = 0; // 1-NHWC, 0-NCHW + + auto input = NDArrayFactory::create('c', {bS, iC, iH, iW}); + auto weights = NDArrayFactory::create('c', {oC, iC, kH, kW}); + auto bias = NDArrayFactory::create('c', {oC}, {1,2,3}); + + auto expOutput = NDArrayFactory::create('c', {bS, oC, oH, oW}, {61.f, 61.f, 61.f, 61.f, 
177.2f, 177.2f, 177.2f, 177.2f, 293.4f, 293.4f, 293.4f, 293.4f, 61.f, 61.f, 61.f, 61.f, 177.2f, 177.2f, 177.2f, 177.2f, 293.4f, 293.4f, 293.4f, 293.4f}); + auto zCUDA = expOutput.like(); + auto zMKL = expOutput.like(); + + input = 2.; + weights.linspace(0.1, 0.1); + weights.permutei({2,3,1,0}); + + input.syncToHost(); + weights.syncToHost(); + bias.syncToHost(); + + nd4j::ops::conv2d op; + + // cuDNN part + Context cuda(1); + cuda.setTargetEngine(samediff::Engine::ENGINE_CUDA); + cuda.setInputArray(0, &input); + cuda.setInputArray(1, &weights); + cuda.setInputArray(2, &bias); + cuda.setOutputArray(0, &zCUDA); + cuda.setIArguments({kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); + auto statusCUDA = op.execute(&cuda); + + ASSERT_EQ(Status::OK(), statusCUDA); + ASSERT_EQ(expOutput, zCUDA); + + // MKL-DNN part + Context mkl(1); + mkl.setTargetEngine(samediff::Engine::ENGINE_CPU); + mkl.setInputArray(0, &input); + mkl.setInputArray(1, &weights); + mkl.setInputArray(2, &bias); + mkl.setOutputArray(0, &zMKL); + mkl.setIArguments({kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); + auto statusMKL = op.execute(&mkl); + + zMKL.tickWriteHost(); + + ASSERT_EQ(Status::OK(), statusMKL); + ASSERT_EQ(expOutput, zMKL); +#endif +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 30c645785..689969543 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -3280,209 +3280,6 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) { delete results; } -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests10, batchnorm_test1) { - - NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); - - NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32); - - input.linspace(0.1, 0.1); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - // output->printBuffer(); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test2) { - - auto input = NDArrayFactory::create('c', {2,3,4}); - auto mean = NDArrayFactory::create('c', {4}); - auto variance = NDArrayFactory::create('c', {4}); - auto gamma = NDArrayFactory::create('c', {4}); - auto beta = NDArrayFactory::create('c', {4}); - - auto expected = NDArrayFactory::create('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f, - 1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f}); - - input.linspace(0.1, 0.1); - mean.assign(1.); - variance.assign(0.5); - 
gamma.assign(1.2); - beta.assign(1.); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - // output->printBuffer(); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test3) { - - auto input = NDArrayFactory::create('c', {2,3,4}); - auto mean = NDArrayFactory::create('c', {3}, {1.05f, 1.1f, 1.15f}); - auto variance = NDArrayFactory::create('c', {3}, {0.5f, 0.6f, 0.7f}); - auto gamma = NDArrayFactory::create('c', {3}, {1.2f, 1.3f, 1.4f}); - auto beta = NDArrayFactory::create('c', {3}, {0.1f, 0.2f, 0.3f}); - - auto expected = NDArrayFactory::create('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f, - 0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f}); - - input.linspace(0.1, 0.1); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test4) { - - auto input = NDArrayFactory::create('c', {2,3,4}); - auto mean = NDArrayFactory::create('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f}); - auto variance = NDArrayFactory::create('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f}); - auto gamma = NDArrayFactory::create('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f}); - auto beta = NDArrayFactory::create('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f}); - - auto expected = NDArrayFactory::create('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f, - 0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f}); - - input.linspace(0.1, 0.1); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests10, batchnorm_test5) { - - NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); - - NDArray expected('c', {2,4,2,2}, { 11.612187f, 11.442483f, 11.272779f, 11.103076f, 18.990039f, 19.145418f, 19.300796f, 19.456175f, 
-9.557284f, -9.704856f, -9.852428f, -10.f, -20.f, - -19.856981f, -19.713963f, -19.570944f, 8.896924f, 8.727221f, 8.557517f, 8.387813f, 21.476097f, 21.631475f, 21.786854f, 21.942233f, -11.918438f, - -12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32); - input.linspace(0.1, 0.1); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - // output->printBuffer(); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests10, batchnorm_test6) { - - NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); - - NDArray expected('c', {2,2,2,4}, {11.612187f, 18.523903f, -8.671851f, -21.287169f, 10.933372f, 19.145418f, -9.262139f, -20.715094f, 10.254556f, 19.766932f, -9.852428f, -20.143019f, 9.57574f, - 20.388447f, -10.442716f, -19.570944f, 8.896924f, 21.009961f, -11.033005f, -18.998869f, 8.218109f, 21.631475f, -11.623294f, -18.426794f, 7.539293f, 22.25299f, - -12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32); - input.linspace(0.1, 0.1); - - nd4j::ops::batchnorm op; - - auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3}); - - ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto output = results->at(0); - - ASSERT_TRUE(expected.isSameShapeStrict(*output)); - ASSERT_TRUE(expected.equalsTo(output)); - - delete results; -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests10, batchnorm_test7) { - - NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); - NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32); - input2.permutei({0,3,1,2}); - - NDArray mean ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32); - NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32); - - NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); - NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32); - - input1.linspace(-1012, 1); - input2.assign(input1); - - nd4j::ops::batchnorm op; - - auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {}); - ASSERT_EQ(ND4J_STATUS_OK, res1); - - auto res2 = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {}); - ASSERT_EQ(ND4J_STATUS_OK, res2); - - ASSERT_TRUE(out1.equalsTo(out2)); -} - /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, bool_broadcast_test_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index c95599ff3..ee569a07c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -38,6 +38,19 @@ public: } }; +template +class TypedDeclarableOpsTests13 : public 
testing::Test { +public: + + TypedDeclarableOpsTests13() { + printf("\n"); + fflush(stdout); + } +}; + +typedef ::testing::Types TestingTypes; +TYPED_TEST_CASE(TypedDeclarableOpsTests13, TestingTypes); + TEST_F(DeclarableOpsTests13, test_pow_1) { auto x = NDArrayFactory::create('c', {2, 2}, {2.f, 2.f, 2.f, 2.f}); auto y = NDArrayFactory::create('c', {2}, {3, 3}); @@ -1948,3 +1961,289 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) { } +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test1) { + + NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + + NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32); + + input.linspace(0.1, 0.1); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + // output->printBuffer(); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test2) { + + auto input = NDArrayFactory::create('c', {2,3,4}); + auto mean = NDArrayFactory::create('c', {4}); + auto variance = NDArrayFactory::create('c', {4}); + auto gamma = NDArrayFactory::create('c', {4}); + auto beta = NDArrayFactory::create('c', {4}); + + auto expected = NDArrayFactory::create('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f, + 1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f}); + + input.linspace(0.1, 0.1); + mean.assign(1.); + variance.assign(0.5); + gamma.assign(1.2); + beta.assign(1.); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + // output->printBuffer(); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test3) { + + auto input = NDArrayFactory::create('c', {2,3,4}); + auto mean = NDArrayFactory::create('c', {3}, {1.05f, 1.1f, 1.15f}); + auto variance = NDArrayFactory::create('c', {3}, {0.5f, 0.6f, 0.7f}); + auto gamma = NDArrayFactory::create('c', {3}, {1.2f, 1.3f, 1.4f}); + auto beta = NDArrayFactory::create('c', {3}, {0.1f, 0.2f, 0.3f}); + + auto expected = NDArrayFactory::create('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f, + 0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f}); + + 
input.linspace(0.1, 0.1); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test4) { + + auto input = NDArrayFactory::create('c', {2,3,4}); + auto mean = NDArrayFactory::create('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f}); + auto variance = NDArrayFactory::create('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f}); + auto gamma = NDArrayFactory::create('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f}); + auto beta = NDArrayFactory::create('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f}); + + auto expected = NDArrayFactory::create('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f, + 0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f}); + + input.linspace(0.1, 0.1); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test5) { + + NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + + NDArray expected('c', {2,4,2,2}, { 11.612187f, 11.442483f, 11.272779f, 11.103076f, 18.990039f, 19.145418f, 19.300796f, 19.456175f, -9.557284f, -9.704856f, -9.852428f, -10.f, -20.f, + -19.856981f, -19.713963f, -19.570944f, 8.896924f, 8.727221f, 8.557517f, 8.387813f, 21.476097f, 21.631475f, 21.786854f, 21.942233f, -11.918438f, + -12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32); + input.linspace(0.1, 0.1); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + // output->printBuffer(); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test6) { + + NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + + NDArray expected('c', {2,2,2,4}, 
{11.612187f, 18.523903f, -8.671851f, -21.287169f, 10.933372f, 19.145418f, -9.262139f, -20.715094f, 10.254556f, 19.766932f, -9.852428f, -20.143019f, 9.57574f, + 20.388447f, -10.442716f, -19.570944f, 8.896924f, 21.009961f, -11.033005f, -18.998869f, 8.218109f, 21.631475f, -11.623294f, -18.426794f, 7.539293f, 22.25299f, + -12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32); + input.linspace(0.1, 0.1); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + + ASSERT_TRUE(expected.isSameShapeStrict(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test7) { + + NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); + NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32); + input2.permutei({0,3,1,2}); + + NDArray mean ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32); + NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32); + + NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); + NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32); + + input1.linspace(-1012, 1); + input2.assign(input1); + + nd4j::ops::batchnorm op; + + auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {}); + ASSERT_EQ(ND4J_STATUS_OK, res1); + + auto res2 = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {}); + ASSERT_EQ(ND4J_STATUS_OK, res2); + + ASSERT_TRUE(out1.equalsTo(out2)); +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test8) { + + NDArray input('c', {2,3,4,5}, nd4j::DataType::FLOAT32); + + NDArray mean ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); + NDArray variance('c', {1,3,4,5}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); + + NDArray expected('c', {2,3,4,5}, {-105.019394, -103.322357, -101.625313, -99.928276, -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922, + -82.957886, -81.260841, -79.563805, -77.866768, -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414, + -60.896374, -59.199333, -57.502296, -55.805256, -54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902, + -38.834862, -37.137825, -35.440784, -33.743744, -32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392, + -16.773354, -15.076314, -13.379274, -11.682236, -9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157, + 6.985196, 8.682236, 10.379274, 12.076314, 13.773354, 15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744, + 32.440784, 34.137825, 35.834862, 37.531902, 39.228943, 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, + 57.896374, 59.593414, 61.290451, 62.987488, 64.684532, 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 
79.957886, 81.654922, + 83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276}, nd4j::DataType::FLOAT32); + + input.linspace(-60, 1); + mean.assign(1.); + variance.assign(0.5); + gamma.assign(1.2); + beta.assign(-1.5); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + + ASSERT_TRUE(expected.isSameShape(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batchnorm_test9) { + + NDArray input('c', {2,3,3,3,3}, nd4j::DataType::FLOAT32); + + NDArray mean ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); + NDArray variance('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); + + NDArray expected('c', {2,3,3,3,3}, {-138.960175, -137.263138, -135.566101, -133.869064, -132.172028, -130.474976, -128.777954, -127.080902, -125.383865, -123.686829, -121.989784, -120.292747, + -118.595711, -116.898666, -115.201630, -113.504593, -111.807549, -110.110512, -108.413475, -106.716431, -105.019394, -103.322357, -101.625313, -99.928276, + -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922, -82.957886, -81.260841, -79.563805, -77.866768, + -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414, -60.896374, -59.199333, -57.502296, -55.805256, + -54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902, -38.834862, -37.137825, -35.440784, -33.743744, + -32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392, -16.773354, -15.076314, -13.379274, -11.682236, + -9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157, 6.985196, 8.682236, 10.379274, 12.076314, 13.773354, + 15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744, 32.440784, 34.137825, 35.834862, 37.531902, 39.228943, + 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, 57.896374, 59.593414, 61.290451, 62.987488, 64.684532, + 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922, 83.351959, 85.049004, 86.746040, 88.443077, 90.140121, + 91.837158, 93.534195, 95.231239, 96.928276, 98.625313, 100.322357, 102.019394, 103.716431, 105.413475, 107.110512, 108.807549, 110.504593, 112.201630, 113.898666, + 115.595711, 117.292747, 118.989784, 120.686829, 122.383865, 124.080902, 125.777946, 127.474976, 129.172028, 130.869064, 132.566101, 134.263138}, nd4j::DataType::FLOAT32); + + input.linspace(-80, 1); + mean.assign(1.); + variance.assign(0.5); + gamma.assign(1.2); + beta.assign(-1.5); + + nd4j::ops::batchnorm op; + + auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3,4}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto output = results->at(0); + // output->printBuffer(); + + ASSERT_TRUE(expected.isSameShape(*output)); + ASSERT_TRUE(expected.equalsTo(output)); + + delete results; +} diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp 
b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
index 829117bed..d83e85f67 100644
--- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
@@ -45,26 +45,26 @@ static void printer(std::initializer_list
 TEST_F(MklDnnTests, helpers_includer) {
     // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
 #ifdef HAVE_MKLDNN
-    nd4j::ops::platforms::PLATFORM_conv2d conv2d;
-    nd4j::ops::platforms::PLATFORM_conv2d_bp conv2d_bp;
+    nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d;
+    nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp;
 
-    nd4j::ops::platforms::PLATFORM_conv2d conv3d;
-    nd4j::ops::platforms::PLATFORM_conv2d_bp conv3d_bp;
+    nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv3d;
+    nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv3d_bp;
 
-    nd4j::ops::platforms::PLATFORM_avgpool2d avgpool2d;
-    nd4j::ops::platforms::PLATFORM_avgpool2d_bp avgpool2d_bp;
+    nd4j::ops::platforms::PLATFORM_avgpool2d_ENGINE_CPU avgpool2d;
+    nd4j::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CPU avgpool2d_bp;
 
-    nd4j::ops::platforms::PLATFORM_maxpool2d maxpool2d;
-    nd4j::ops::platforms::PLATFORM_maxpool2d_bp maxpool2d_bp;
+    nd4j::ops::platforms::PLATFORM_maxpool2d_ENGINE_CPU maxpool2d;
+    nd4j::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CPU maxpool2d_bp;
 
-    nd4j::ops::platforms::PLATFORM_avgpool3dnew avgpool3d;
-    nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp avgpool3d_bp;
+    nd4j::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CPU avgpool3d;
+    nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CPU avgpool3d_bp;
 
-    nd4j::ops::platforms::PLATFORM_maxpool3dnew maxpool3d;
-    nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp maxpool3d_bp;
+    nd4j::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CPU maxpool3d;
+    nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp;
 
-    nd4j::ops::platforms::PLATFORM_lrn lrn;
-    nd4j::ops::platforms::PLATFORM_batchnorm batchnorm;
+    nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn;
+    nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm;
 
     printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm});
 #endif
diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu
index 3cdd8f70a..c6c0a1bd8 100644
--- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu
@@ -247,8 +247,10 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) {
     auto res = cudaStreamSynchronize(*stream);
     ASSERT_EQ(0, res);
     //double* localBuffer = ;
+    z.syncToHost();
     cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost);
     res = cudaStreamSynchronize(*stream);
+    z.tickWriteHost();
     ASSERT_EQ(0, res);
 
     //
diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
index 07cae9ae3..fbba329e3 100644
--- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
@@ -150,7 +150,7 @@ if ("${EXPERIMENTAL}" STREQUAL "yes")
 endif()
 
 # tests are always compiled with all ops included
-SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true")
+SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DDEFAULT_ENGINE=samediff::ENGINE_CPU")
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     # using Clang
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java
index 7b649b488..aa6d91519 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java
@@ -38,6 +38,7 @@ import org.bytedeco.javacpp.tools.InfoMapper;
             "array/ConstantDataBuffer.h",
             "array/TadPack.h",
             "execution/ErrorReference.h",
+            "execution/Engine.h",
             "memory/MemoryType.h",
             "Environment.h",
             "types/utf8string.h",
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java
index ec5c25d86..c2fca8d89 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java
@@ -41,6 +41,7 @@ import java.util.Scanner;
            "array/ConstantDescriptor.h",
            "array/TadPack.h",
            "execution/ErrorReference.h",
+           "execution/Engine.h",
            "Environment.h",
            "types/utf8string.h",
            "NativeOps.h",