cuDNN integration (#150)

* initial commit Signed-off-by: raver119 <raver119@gmail.com> * one file Signed-off-by: raver119 <raver119@gmail.com> * few more includes Signed-off-by: raver119 <raver119@gmail.com> * m? Signed-off-by: raver119 <raver119@gmail.com> * const Signed-off-by: raver119 <raver119@gmail.com> * cudnn linkage in tests Signed-off-by: raver119 <raver119@gmail.com> * culibos Signed-off-by: raver119 <raver119@gmail.com> * static reminder Signed-off-by: raver119 <raver119@gmail.com> * platform engine tag Signed-off-by: raver119 <raver119@gmail.com> * HAVE_CUDNN moved to config.h.in Signed-off-by: raver119 <raver119@gmail.com> * include Signed-off-by: raver119 <raver119@gmail.com> * include Signed-off-by: raver119 <raver119@gmail.com> * skip cudnn handle creation if there's not cudnn Signed-off-by: raver119 <raver119@gmail.com> * meh Signed-off-by: raver119 <raver119@gmail.com> * target device in context Signed-off-by: raver119 <raver119@gmail.com> * platform engines Signed-off-by: raver119 <raver119@gmail.com> * platform engines Signed-off-by: raver119 <raver119@gmail.com> * allow multiple -h args Signed-off-by: raver119 <raver119@gmail.com> * allow multiple -h args Signed-off-by: raver119 <raver119@gmail.com> * move mkldnn out of CPU block Signed-off-by: raver119 <raver119@gmail.com> * link to mkldnn on cuda Signed-off-by: raver119 <raver119@gmail.com> * less prints Signed-off-by: raver119 <raver119@gmail.com> * minor tweaks Signed-off-by: raver119 <raver119@gmail.com> * next step Signed-off-by: raver119 <raver119@gmail.com> * conv2d NCHW draft Signed-off-by: raver119 <raver119@gmail.com> * conv2d biasAdd Signed-off-by: raver119 <raver119@gmail.com> * test for MKL/CUDNN combined use Signed-off-by: raver119 <raver119@gmail.com> * - provide additional code for conv2d ff based on cudnn api, not tested yet Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on conv2d helper based on using cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing several cuda bugs which 
appeared after the cudnn lib came into use Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of conv2d backprop op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of conv3d and conv3d_bp ops based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - bug fixing in conv3d/conv3d_bp ops (cudnn in use) Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of depthwiseConv2d (ff/bp) op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of batchnorm ff op based on cudnn api Signed-off-by: Yurii <iuriish@yahoo.com> * - disable cudnn batchnorm temporarily Signed-off-by: Yurii <iuriish@yahoo.com> * - add minor change in cmake Signed-off-by: Yurii <iuriish@yahoo.com> * engine for depthwise mkldnn Signed-off-by: raver119 <raver119@gmail.com> * couple of includes Signed-off-by: raver119 <raver119@gmail.com> * - provide permutation to cudnn batchnorm ff when format is NHWC Signed-off-by: Yurii <iuriish@yahoo.com> * lgamma fix Signed-off-by: raver119 <raver119@gmail.com> * - eliminate memory leak in two tests Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: Yurii Shyrma <iuriish@yahoo.com>
2020-01-20 21:32:46 +03:00 · 2020-01-20 21:32:46 +03:00 · 7783012f39
commit 7783012f39
parent 8fc0e63ce7
65 changed files with 2810 additions and 435 deletions
--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@ -14,6 +14,10 @@ set(CMAKE_CXX_STANDARD 11)
 if (CUDA_BLAS)
    enable_language(CUDA)
    set(CMAKE_CUDA_STANDARD 11)
+
+    set(DEFAULT_ENGINE "samediff::ENGINE_CUDA")
+else()
+    set(DEFAULT_ENGINE "samediff::ENGINE_CPU")
 endif()

 # MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively
@ -60,6 +64,7 @@ if(NATIVE)
    ENDIF()
 endif()

+
 if(NOT CUDA_BLAS)
    # we need this definition to avoid global memory use within mkldnn
    add_definitions(-DDNNL_ENABLE_CONCURRENT_EXEC=true)
@ -128,36 +133,70 @@ if(NOT CUDA_BLAS)
        include_directories(${CPUF_SOURCE_DIR}/include)
        set(CPU_FEATURES cpu_features)
    endif()
+endif()

-    # new mkl-dnn entry
-    if (${HELPERS_mkldnn})
-        message("Going to pull & build mkldnn")
-        set(HAVE_MKLDNN 1)
-        set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE)

-        configure_file(./CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt)
-        execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-                    RESULT_VARIABLE result
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download )
-        if(result)
-            message(FATAL_ERROR "CMake step for mkldnn failed: ${result}")
-        endif()
-        execute_process(COMMAND ${CMAKE_COMMAND} --build .
-                    RESULT_VARIABLE result
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download )
-        if(result)
-            message(FATAL_ERROR "Build step for mkldnn failed: ${result}")
-        endif()
+# new mkl-dnn entry
+if (${HELPERS_mkldnn})
+    message("Going to pull & build mkldnn")
+    set(HAVE_MKLDNN 1)
+    set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE)

-        add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src
-                    ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build
-                    EXCLUDE_FROM_ALL)
+    configure_file(./CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt)
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+            RESULT_VARIABLE result
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download )
+    if(result)
+        message(FATAL_ERROR "CMake step for mkldnn failed: ${result}")
+    endif()
+    execute_process(COMMAND ${CMAKE_COMMAND} --build .
+            RESULT_VARIABLE result
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download )
+    if(result)
+        message(FATAL_ERROR "Build step for mkldnn failed: ${result}")
+    endif()

-        set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build)
-        set(mkldnn_EXT_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src)
-        set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}")
-        include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR})
-        set(MKLDNN dnnl)
+    add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src
+            ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build
+            EXCLUDE_FROM_ALL)
+
+    set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build)
+    set(mkldnn_EXT_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src)
+    set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}")
+    include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR})
+    set(MKLDNN dnnl)
+endif()
+
+
+# Optional cuDNN helper. Enabled with -DHELPERS_cudnn=true and only valid for
+# CUDA builds. On success this sets:
+#   HAVE_CUDNN - consumed by config.h.in and by blas/CMakeLists.txt
+#   CUDNN      - list of libraries handed to target_link_libraries()
+if (${HELPERS_cudnn})
+    if (NOT CUDA_BLAS)
+        message(FATAL_ERROR "Can't build cuDNN on non-CUDA platform")
+    endif()
+
+    set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN")
+
+    # FIXME: we don't want static library in master
+    SET(CUDNN_LIBNAME "cudnn")
+    SET(CULIBOS_LIBNAME "culibos")
+
+    find_path(CUDNN_INCLUDE_DIR cudnn.h
+            HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+            PATH_SUFFIXES cuda/include include)
+
+    find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
+            HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+            PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
+
+    find_library(CULIBOS_LIBRARY ${CULIBOS_LIBNAME}
+            HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+            PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
+
+    # Put the located header directory on the include path so that
+    # #include <cudnn.h> resolves when cuDNN lives under CUDNN_ROOT_DIR.
+    # A miss is only a warning: the compiler may still find the header
+    # through its own default search paths.
+    if (CUDNN_INCLUDE_DIR)
+        include_directories("${CUDNN_INCLUDE_DIR}")
+    else()
+        message(WARNING "cudnn.h was not found via CUDNN_ROOT_DIR/CUDA_TOOLKIT_ROOT_DIR; relying on default include paths")
+    endif()
+
+    if (CUDNN_LIBRARY)
+        set(HAVE_CUDNN true)
+        set(CUDNN ${CUDNN_LIBRARY})
+        # Append culibos only when it was actually located: an unresolved
+        # find_library() result (CULIBOS_LIBRARY-NOTFOUND) must never reach
+        # target_link_libraries().
+        if (CULIBOS_LIBRARY)
+            list(APPEND CUDNN ${CULIBOS_LIBRARY})
+        endif()
+    else()
+        message(FATAL_ERROR "Unable to find cuDNN")
    endif()
 endif()

@ -185,6 +224,8 @@ set(HAVE_FLATBUFFERS 1)
 set(FLATBUFFERS_PATH ${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src)
 include_directories(${FLATBUFFERS_PATH}/include)

+
+
 configure_file(include/config.h.in include/config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)

--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@ -131,6 +131,11 @@ if(!CUDA_BLAS)
    endif()
 endif()

+#if MKLDNN is enabled - we're building mkldnn-powered helpers
+if (HAVE_MKLDNN)
+    file(GLOB_RECURSE CUSTOMOPS_MKLDNN_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h)
+endif()
+
 if(CUDA_BLAS)
    message("Build cublas")
    find_package(CUDA)
@ -210,16 +215,23 @@ if(CUDA_BLAS)
        file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h)
        file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu)

+        if (HAVE_CUDNN)
+            message("cuDNN included")
+            file(GLOB_RECURSE CUSTOMOPS_CUDNN_SOURCES false ../include/ops/declarable/platform/cudnn/*.cu)
+        endif()
+
 		add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
                ${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
                ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
                cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
                Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
-                ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
+                ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES})

        add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)

-        message("MSVC runtime for library: ${MSVC_RT_LIB}")
+        if (WIN32)
+            message("MSVC runtime for library: ${MSVC_RT_LIB}")
+        endif()

        # static library is built only if we're going to build tests, skip otherwise
        if (BUILD_TESTS)
@ -237,7 +249,7 @@ if(CUDA_BLAS)
            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
        endif()

-		target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY})
+		target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN})
 	    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda)

 		install(TARGETS ${LIBND4J_NAME} DESTINATION .)
@ -264,12 +276,6 @@ elseif(CPU_BLAS)
    file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/*.cpp ../include/helpers/*.h)
    file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/*.cpp ../include/loops/*.h)

-
-    #if MKLDNN is enabled - we're building mkldnn-powered helpers
-    if (HAVE_MKLDNN)
-        file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h)
-    endif()
-
    if (X86_BUILD)
        # we disable platform optimizations for certains files for linux/macos
        set_source_files_properties(cpu/NativeOps.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic")
@ -282,7 +288,7 @@ elseif(CPU_BLAS)
            cpu/NativeOpExecutioner.cpp cpu/NDArray.cpp cpu/NDArrayFactory.cpp
            ../include/cnpy/cnpy.cpp  ../include/nd4jmemset.h ../include/nd4jmalloc.h
            Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
-            ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
+            ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
            ${OPS_SOURCES} ${PERF_SOURCES})
    if(IOS)
        add_library(${LIBND4J_NAME}       STATIC $<TARGET_OBJECTS:nd4jobj>)
--- a/libnd4j/blas/NDArray.h
+++ b/libnd4j/blas/NDArray.h
@ -940,6 +940,7 @@ namespace nd4j {
        template <typename T>
        std::vector<T> getBufferAsVector();
        std::vector<Nd4jLong> getShapeAsVector() const;
+        std::vector<int> getShapeAsVectorInt() const;
        std::vector<Nd4jLong> getShapeInfoAsVector();
        std::vector<int64_t> getShapeInfoAsFlatVector();
        std::vector<int64_t> getShapeAsFlatVector();
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@ -444,6 +444,16 @@ std::vector<Nd4jLong> NDArray::getShapeAsVector() const {
    return vector;
 }

+////////////////////////////////////////////////////////////////////////
+// Returns the size of every dimension of this array, narrowed to int,
+// in dimension order (one entry per rankOf()).
+std::vector<int> NDArray::getShapeAsVectorInt() const {
+
+    const auto rank = this->rankOf();
+
+    std::vector<int> dims;
+    dims.reserve(rank);
+    for (int dim = 0; dim < rank; ++dim)
+        dims.push_back(static_cast<int>(this->sizeAt(dim)));
+
+    return dims;
+}
+
 ////////////////////////////////////////////////////////////////////////
 std::vector<int64_t> NDArray::getShapeInfoAsFlatVector() {
    int magicNumber = shape::shapeInfoLength(this->rankOf());
@ -625,7 +635,7 @@ void NDArray::assign(const NDArray& other, bool allowParallelism) {
        if (other.lengthOf() != lengthOf()) {
            auto shapeThis = ShapeUtils::shapeAsString(this);
            auto shapeThat = ShapeUtils::shapeAsString(&other);
-            nd4j_printf("Can't assign new value to the array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str());
+            nd4j_printf("Can't assign array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str());
            throw std::runtime_error("NDArray::assign: lengths of arrays are mismatched");
        }

--- a/libnd4j/blas/cuda/NativeOpExecutioner.cu
+++ b/libnd4j/blas/cuda/NativeOpExecutioner.cu
@ -488,7 +488,7 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext  *lc,
        throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", xType, zType);

    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 8192);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 8192);

    BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES);

@ -523,7 +523,7 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext  *lc,

    auto xRank = shape::rank(hXShapeInfo);
    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES);

@ -559,7 +559,7 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext  *lc,

    auto xRank = shape::rank(hXShapeInfo);
    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES);

@ -601,7 +601,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext  *lc,
 	auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo);
    auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo);
 	auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    if (zType != nd4j::DataType::INT64 && zType != nd4j::DataType::INT32)
        throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType);
@ -647,7 +647,7 @@ void  NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext  *lc,

    auto xRank = shape::rank(hXShapeInfo);
    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

@ -684,7 +684,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

 	if (nd4j::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1)
 		printf("AF1 opNum:[%i]\n", opNum);
@ -734,7 +734,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);

@ -766,7 +766,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, BOOL_TYPES);

@ -797,7 +797,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES);

@ -828,7 +828,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, LONG_TYPES);

@ -1085,7 +1085,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext  *lc,

    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    if (xType != yType)
        throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType);
@ -1135,7 +1135,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext  *lc,


    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum,
                                                                    dX, dXShapeInfo,
@ -1177,7 +1177,7 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext  *lc,
    auto xLength = shape::length(hXShapeInfo);
    auto blockWidth = 256;
    auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
-    dim3 launchDims(numBlocks, blockWidth, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);

    if (xType != yType)
        throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType);
@ -1595,7 +1595,7 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext  *lc,
        throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType);

    auto numBlocks = shape::length(hZShapeInfo);
-    dim3 launchDims(numBlocks, 256, 32768);
+    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

--- a/libnd4j/buildnativeoperations.sh
+++ b/libnd4j/buildnativeoperations.sh
@ -489,6 +489,7 @@ mkbuilddir() {
    cd "blasbuild/$CHIP"
 }

+HELPERS=""
 if [ "$HELPER" == "" ]; then
  echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
  echo "!!                                                                                                           !!"
@ -503,6 +504,14 @@ if [ "$HELPER" == "" ]; then
  echo "!!                                                                                                           !!"
  echo "!!                                                                                                           !!"
  echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
+else
+  # Helpers were requested: turn the comma-separated list in $HELPER into
+  # one -DHELPERS_<name>=true definition per entry, to be passed to CMake.
+  # IFS is overridden for the `read` call only, so the shell's default
+  # field separators (space, tab, newline) stay in effect afterwards.
+  IFS=',' read -ra HLP <<< "$HELPER"
+  for i in "${HLP[@]}"; do
+    # skip empty entries left by stray commas (e.g. "mkldnn,,cudnn")
+    if [ -n "$i" ]; then
+      HELPERS="${HELPERS} -DHELPERS_$i=true"
+    fi
+  done
 fi

 echo PACKAGING  = "${PACKAGING}"
@ -519,10 +528,10 @@ echo MINIFIER = "${MINIFIER_ARG}"
 echo TESTS = "${TESTS_ARG}"
 echo NAME = "${NAME_ARG}"
 echo OPENBLAS_PATH = "$OPENBLAS_PATH"
-echo HELPERS = "$HELPER"
+echo HELPERS = "$HELPERS"
 mkbuilddir
 pwd
-eval $CMAKE_COMMAND  "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DHELPERS_"$HELPER"=true "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
+eval $CMAKE_COMMAND  "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
 if [ "$PARALLEL" == "true" ]; then
    MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ"
 fi
--- a/libnd4j/include/config.h.in
+++ b/libnd4j/include/config.h.in
@ -13,4 +13,8 @@

 #cmakedefine FLATBUFFERS_PATH "@FLATBUFFERS_PATH@"

+#cmakedefine HAVE_CUDNN
+
+#cmakedefine DEFAULT_ENGINE @DEFAULT_ENGINE@
+
 #endif
--- a/libnd4j/include/execution/Engine.h
+++ b/libnd4j/include/execution/Engine.h
@ -0,0 +1,31 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef SD_ENGINE_H
+#define SD_ENGINE_H
+
+namespace samediff {
+    enum Engine {
+        ENGINE_CPU = 0,
+        ENGINE_CUDA = 1,
+    };
+}
+
+#endif //SD_ENGINE_H
--- a/libnd4j/include/execution/Executor.h
+++ b/libnd4j/include/execution/Executor.h
@ -18,8 +18,8 @@
 // @author raver119@gmail.com
 //

-#ifndef DEV_TESTS_EXECUTOR_H
-#define DEV_TESTS_EXECUTOR_H
+#ifndef SD_EXECUTOR_H
+#define SD_EXECUTOR_H

 namespace nd4j {
    class Executor {
@ -30,4 +30,4 @@ namespace nd4j {
    };
 }

-#endif //DEV_TESTS_EXECUTOR_H
+#endif //SD_EXECUTOR_H
--- a/libnd4j/include/execution/LaunchContext.h
+++ b/libnd4j/include/execution/LaunchContext.h
@ -27,6 +27,7 @@
 #include <cuda_runtime_api.h>
 #include <cuda_runtime.h>
 #include <cuda_device_runtime_api.h>
+#include "config.h"
 #endif

 // used for MKLDNN etc
@ -81,6 +82,7 @@ class ND4J_EXPORT LaunchContext {
 		int* getAllocationPointer() const;
 		void* getCublasHandle() const;
 		void* getCusolverHandle() const;
+		void* getCuDnnHandle() const;
 		cudaStream_t* getCudaStream() const;
 		cudaStream_t* getCudaSpecialStream() const;

--- a/libnd4j/include/execution/cuda/LaunchContext.cu
+++ b/libnd4j/include/execution/cuda/LaunchContext.cu
@ -166,6 +166,10 @@ LaunchContext::LaunchContext() {
        return contextBuffers.isInitialized();
    }

+    void* LaunchContext::getCuDnnHandle() const {
+        return CublasHelper::getInstance()->cudnn();
+    }
+
    sd::ErrorReference* LaunchContext::errorReference() {
        return contextBuffers.errorReference();
    }
--- a/libnd4j/include/graph/Context.h
+++ b/libnd4j/include/graph/Context.h
@ -27,6 +27,7 @@
 #include <graph/VariableSpace.h>
 #include <graph/ContextPrototype.h>
 #include <memory/Workspace.h>
+#include <execution/Engine.h>

 // CUDA-specific includes
 #ifdef __CUDACC__
@ -102,12 +103,13 @@ namespace nd4j {
            // this method returns workspace for object allocations
            nd4j::memory::Workspace* oWorkspace();

-
            void setVariableSpace(VariableSpace* variableSpace);

            nd4j::random::RandomBuffer* getRNG();
            void setRNG(nd4j::random::RandomBuffer* rng);

+            void setTargetEngine(samediff::Engine engine);
+
            VariableSpace *getVariableSpace();

            LaunchContext* launchContext();
--- a/libnd4j/include/graph/ContextPrototype.h
+++ b/libnd4j/include/graph/ContextPrototype.h
@ -27,6 +27,11 @@
 #include <dll.h>
 #include <RandomGenerator.h>
 #include <ops/declarable/OpDescriptor.h>
+#include <execution/Engine.h>
+
+#ifndef __STANDALONE_BUILD__
+#include <config.h>
+#endif

 namespace nd4j {
    namespace graph {
@ -53,6 +58,8 @@ namespace nd4j {
            nd4j::ops::OpDescriptor* _opDescriptor;
            bool _useMKLDNN = nd4j::Environment::getInstance()->isUseMKLDNN();

+            // target engine for execution
+            samediff::Engine _engine = DEFAULT_ENGINE;
        public:
            explicit ContextPrototype(nd4j::ops::OpDescriptor* opDescriptor = nullptr, int nodeId = 1, bool inPlace = false);
            ~ContextPrototype() = default;
@ -84,6 +91,8 @@ namespace nd4j {
            std::vector<bool>* getBArguments();
            std::vector<int>* getAxis();

+            samediff::Engine engine();
+
            size_t numT();
            size_t numI();
            size_t numB();
--- a/libnd4j/include/graph/impl/Context.cpp
+++ b/libnd4j/include/graph/impl/Context.cpp
@ -107,6 +107,10 @@ namespace nd4j {
                delete _context;
        }

+        void Context::setTargetEngine(samediff::Engine engine) {
+            _engine = engine;
+        }
+
        bool Context::hasWorkspaceProvided() {
            return this->_workspace != nullptr;
        }
--- a/libnd4j/include/graph/impl/ContextPrototype.cpp
+++ b/libnd4j/include/graph/impl/ContextPrototype.cpp
@ -59,6 +59,10 @@ namespace nd4j {
            }
        }

+        samediff::Engine ContextPrototype::engine() {
+            return _engine;
+        }
+
        bool ContextPrototype::hasVariablesFilled() {
            return this->_inputs.size() > 0;
        }
--- a/libnd4j/include/helpers/cublasHelper.h
+++ b/libnd4j/include/helpers/cublasHelper.h
@ -34,12 +34,14 @@ namespace nd4j {

        std::vector<void*> _cache;
        std::vector<void*> _solvers;
+        std::vector<void*> _cudnn;

        CublasHelper();
        ~CublasHelper();
    public:
        static CublasHelper* getInstance();

+        void* cudnn();
        void* solver();

        void* handle();
--- a/libnd4j/include/helpers/cuda_off/cublasHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/cublasHelper.cu
@ -25,6 +25,13 @@
 #include <exceptions/cuda_exception.h>
 #include <helpers/logger.h>
 #include <execution/AffinityManager.h>
+#include "config.h"
+
+#ifdef HAVE_CUDNN
+
+#include <cudnn.h>
+
+#endif

 namespace nd4j {
    std::mutex CublasHelper::_mutex;
@ -47,6 +54,18 @@ namespace nd4j {
        return cusolverH;
    }

+    /**
+     * Creates a cuDNN handle for the currently active device.
+     *
+     * @return pointer to a heap-allocated cudnnHandle_t when the library is
+     *         built with HAVE_CUDNN, nullptr otherwise
+     * @throws cuda_exception if cudnnCreate() does not report success
+     */
+    static void* cudnn_() {
+#ifdef HAVE_CUDNN
+        auto cudnnH = new cudnnHandle_t();
+        auto status = cudnnCreate(cudnnH);
+        if (status != CUDNN_STATUS_SUCCESS) {
+            // release the slot we allocated, otherwise it leaks on failure
+            delete cudnnH;
+            throw cuda_exception::build("cuDNN handle creation failed !", status);
+        }
+
+        return cudnnH;
+#else
+        // built without cuDNN: there is no handle to create
+        return nullptr;
+#endif
+    }
+
    static void destroyHandle_(void* handle) {
        auto ch = reinterpret_cast<cublasHandle_t *>(handle);
        auto status = cublasDestroy_v2(*ch);
@ -62,11 +81,13 @@ namespace nd4j {
        auto currentDevice = AffinityManager::currentDeviceId();
        _cache.resize(numDevices);
        _solvers.resize(numDevices);
+        _cudnn.resize(numDevices);
        for (int e = 0; e < numDevices; e++) {
            AffinityManager::setCurrentNativeDevice(e);

            _cache[e] = handle_();
            _solvers[e] = solver_();
+            _cudnn[e] = cudnn_();
        }

        // don't forget to restore back original device
@ -90,6 +111,14 @@ namespace nd4j {
        return _INSTANCE;
    }

+    /**
+     * Returns the cuDNN handle stored for the device reported by
+     * AffinityManager::currentDeviceId().
+     *
+     * @return the entry of _cudnn for the current device; this is nullptr
+     *         when the library was built without HAVE_CUDNN
+     * @throws cuda_exception if the device id is outside [0, _cudnn.size())
+     */
+    void* CublasHelper::cudnn() {
+        auto deviceId = AffinityManager::currentDeviceId();
+        // valid indices are [0, size()); the previous `>` comparison let
+        // deviceId == size() through and read one past the end
+        if (deviceId < 0 || static_cast<size_t>(deviceId) >= _cudnn.size())
+            throw cuda_exception::build("requested deviceId doesn't look valid", deviceId);
+
+        return _cudnn[deviceId];
+    }
+
    void* CublasHelper::handle() {
        auto deviceId = AffinityManager::currentDeviceId();
        return handle(deviceId);
--- a/libnd4j/include/loops/cuda/indexreduce.cu
+++ b/libnd4j/include/loops/cuda/indexreduce.cu
@ -35,12 +35,12 @@ static __global__ void simpleIndexReduceGeneric(const int op,
                                           Nd4jLong *xShapeInfo, int xRank,
                                           void *extraParams,
                                           void *result,
-                                           Nd4jLong *resultShapeInfo, int zRank,
+                                           Nd4jLong *zShapeInfo, int zRank,
                                           int *dimension,
                                           int dimensionLength,
                                           int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {

-     functions::indexreduce::IndexReduce<X, Z>::transform(op,dx,xShapeInfo,extraParams,result,resultShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets);
+     functions::indexreduce::IndexReduce<X, Z>::transform(op,dx,xShapeInfo,extraParams,result,zShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets);
 }

 namespace functions {
@ -52,7 +52,7 @@ namespace functions {
                                                                void *dx, Nd4jLong *xShapeInfo,
                                                                int xRank,
                                                                void *extraParams,
-                                                                void *result, Nd4jLong *resultShapeInfo,
+                                                                void *result, Nd4jLong *zShapeInfo,
                                                                int zRank,
                                                                int *dimension, int dimensionLength,
                                                                int postProcessOrNot,
@ -62,7 +62,7 @@ namespace functions {
            simpleIndexReduceGeneric<X, Z><<<launchDims.x,launchDims.y,launchDims.z, *stream>>>(opNum,
                                                                                            dx, xShapeInfo, xRank,
                                                                                            extraParams,
-                                                                                            result, resultShapeInfo, 0,
+                                                                                            result, zShapeInfo, 0,
                                                                                            nullptr, 0,
                                                                                            1,
                                                                                            allocationBuffer, reductionBuffer,
@ -70,14 +70,14 @@ namespace functions {
        }

        template <typename X, typename Z>
-        _CUDA_H void IndexReduce<X, Z>::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {
+        _CUDA_H void IndexReduce<X, Z>::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {
            simpleIndexReduceGeneric<X, Z><<<launchDims.x,launchDims.y,launchDims.z, *stream>>>(
 			 opNum,
 			 dx,
 			 xShapeInfo, xRank,
 			 extraParams,
 			 result,
-			 resultShapeInfo, zRank,
+			 zShapeInfo, zRank,
 			 dimension,
 			 dimensionLength,
 			 1, allocationBuffer, reductionBuffer, tadOnlyShapeInfo, tadOffsets);
@ -158,7 +158,7 @@ namespace functions {
                Nd4jLong *xShapeInfo,
                void *extraParams,
                void *result,
-                Nd4jLong *resultShapeInfo,
+                Nd4jLong *zShapeInfo,
                int *dimension,
                int dimensionLength,
                int postProcessOrNot,
@ -166,7 +166,7 @@ namespace functions {
                void *reductionBuffer,
                Nd4jLong *tadShapeInfo,
                Nd4jLong *tadOffset) {
-             DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, resultShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS);
+             DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS);
        }


@ -174,7 +174,7 @@ namespace functions {
        template <typename OpType>
        __device__ void IndexReduce<X, Z>::transform(void *vdx, Nd4jLong *xShapeInfo,
                                                void *vextraParams,
-                                                void *vresult, Nd4jLong *resultShapeInfo,
+                                                void *vz, Nd4jLong *zShapeInfo,
                                                int *dimension, int dimensionLength,
                                                int postProcessOrNot,
                                                int *allocationBuffer, void *vreductionBuffer,
@ -183,7 +183,7 @@ namespace functions {
             * Gpu information for the problem
             */
            auto dx = reinterpret_cast<X*>(vdx);
-            auto result = reinterpret_cast<Z*>(vresult);
+            auto z = reinterpret_cast<Z*>(vz);
            auto extraParams = static_cast<X*>(vextraParams);
            auto reductionBuffer = static_cast<X*>(vreductionBuffer);
            auto order = shape::order(xShapeInfo);
@ -203,19 +203,19 @@ namespace functions {
            //length for the tad
            __shared__ volatile Nd4jLong xLength;

-            __shared__ volatile Nd4jLong resultLength;
+            __shared__ volatile Nd4jLong zLen;


            //only compute the tad indexes once
            IndexValue<X> reduction = OpType::startingIndexValue(dx);

            if (threadIdx.x == 0) {
-                if (resultShapeInfo != nullptr)
-                    resultLength = shape::length(resultShapeInfo);
-                else resultLength = 1;
+                if (zShapeInfo != nullptr)
+                    zLen = shape::length(zShapeInfo);
+                else zLen = 1;

                if (dimensionLength == 1) {
-                    if (resultLength == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION))
+                    if (zLen == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION))
                        resultScalar = 1;
                    else
                        resultScalar = 0;
@ -223,13 +223,24 @@ namespace functions {
                else
                    resultScalar = 0;

-                if (resultLength == 1)
+                if (zLen == 1)
                    resultScalar = 1;

                xLength = shape::length(xShapeInfo);
            }
            __syncthreads();

+            if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) {
+
+                if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
+                    return;
+
+                for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x)
+                    z[i] = (Z) reduction.index;
+
+                return;
+            }
+
            if (!resultScalar) {

                __shared__ Nd4jLong tadLength;
@ -261,7 +272,7 @@ namespace functions {

                        __syncthreads();
                        if (threadIdx.x == 0) {
-                            result[r] = (Z) sPartials[threadIdx.x].index;
+                            z[r] = (Z) sPartials[threadIdx.x].index;
                        }
                        __syncthreads();
                    }
@ -282,7 +293,7 @@ namespace functions {

                        __syncthreads();
                        if (threadIdx.x == 0) {
-                            result[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
+                            z[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
                        }
                        __syncthreads();
                    }
@ -345,14 +356,14 @@ namespace functions {

                        __syncthreads();
                        if (tid == 0) {
-                            result[0] = (Z) sPartials[0].index;
+                            z[0] = (Z) sPartials[0].index;
                        }
                    }
                } else {
                    if (tid == 0) {
                        auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);
                        tc[16384] = 0;
-                        result[0] = (Z) sPartials[0].index;
+                        z[0] = (Z) sPartials[0].index;
                    }
                }

--- a/libnd4j/include/memory/cuda/Workspace.cu
+++ b/libnd4j/include/memory/cuda/Workspace.cu
@ -143,7 +143,7 @@ namespace nd4j {
                cudaFreeHost((void *)this->_ptrHost);

            if (this->_allocatedDevice && !_externalized)
-                cudaFree((void *)this->_ptrHost);
+                cudaFree((void *)this->_ptrDevice);

            freeSpills();
        }
--- a/libnd4j/include/ops/declarable/OpRegistrator.h
+++ b/libnd4j/include/ops/declarable/OpRegistrator.h
@ -27,6 +27,7 @@
 #include <mutex>
 #include <ops/declarable/DeclarableOp.h>
 #include <ops/declarable/PlatformHelper.h>
+#include <execution/Engine.h>

 // handlers part
 #include <cstdlib>
@ -66,8 +67,8 @@ namespace nd4j {
            std::vector<nd4j::ops::DeclarableOp *> _uniqueD;

            // pointers to platform-specific helpers
-            std::map<Nd4jLong, nd4j::ops::platforms::PlatformHelper*> _helpersLH;
-            std::map<std::string, nd4j::ops::platforms::PlatformHelper*> _helpersH;
+            std::map<std::pair<Nd4jLong, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> _helpersLH;
+            std::map<std::pair<std::string, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> _helpersH;
            std::vector<nd4j::ops::platforms::PlatformHelper*> _uniqueH;

            std::mutex _locker;
@ -98,13 +99,13 @@ namespace nd4j {

            void registerHelper(nd4j::ops::platforms::PlatformHelper* op);

-            bool hasHelper(Nd4jLong hash);
+            bool hasHelper(Nd4jLong hash, samediff::Engine engine);

            nd4j::ops::DeclarableOp* getOperation(const char *name);
            nd4j::ops::DeclarableOp* getOperation(Nd4jLong hash);
            nd4j::ops::DeclarableOp* getOperation(std::string &name);

-            nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash);
+            nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash, samediff::Engine engine);

            std::vector<Nd4jLong> getAllHashes();

--- a/libnd4j/include/ops/declarable/PlatformHelper.h
+++ b/libnd4j/include/ops/declarable/PlatformHelper.h
@ -22,6 +22,7 @@
 #define SD_PLATFORMHELPER_H

 #include <ShapeUtils.h>
+#include <execution/Engine.h>
 #include <graph/Context.h>
 #include <string>
 #include <pointercast.h>
@ -35,18 +36,23 @@ namespace  nd4j {
             */
            class ND4J_EXPORT PlatformHelper {
            protected:
+                // target engine for this impl
+                samediff::Engine _engine;
+
                // name of the operation this helper is built for
                std::string _name;

                // hash of the operation this helper is built for
                Nd4jLong _hash;
            public:
-                PlatformHelper(const char *name);
+                PlatformHelper(const char *name, samediff::Engine engine);

                ~PlatformHelper() = default;

                std::string name();

+                samediff::Engine engine();
+
                Nd4jLong hash();

                /**
--- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp
@ -199,16 +199,16 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
    int dH = INT_ARG(10);                                                       // dilations height
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                              // 1-SAME,  0-VALID
-    int isNDHWC  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);

    int trueoD, trueoH, trueoW;          // true output depth/height/width
    ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);

-    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
+    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D_BP OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
    std::string expectedGradOShape   = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2}));
    std::string expectedWeightsShape = ShapeUtils::shapeAsString({kD, kH, kW, iC, oC});
    REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0,  "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str());
@ -222,7 +222,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {

    std::vector<int> gradOaxesForDot;

-    if(!isNDHWC) {
+    if(!isNCDHW) {
        gradOaxesForDot  = {0,1,2,3};                                           // bS, oD, oH, oW
        input = new NDArray(input->permute({0,4,1,2,3}));                       // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW]
        gradI = new NDArray(gradI->permute({0,4,1,2,3}));                       // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW]
@ -249,7 +249,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
    MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2,3,4,1,0,5,6,7});   // [kD, kH, kW, iC, oC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
    ConvolutionUtils::col2vol(block, columns, *gradI, sD, sH, sW, pD, pH, pW, dD, dH, dW);                   // columns [bS, iC, kD, kH, kW, oD, oH, oW] is de-convoluted to  [bS, iC, iD, iH, iW]

-    if(!isNDHWC) {
+    if(!isNCDHW) {
        delete input;
        delete gradI;
    }
@ -287,7 +287,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
    int dH = INT_ARG(10);                                                       // dilations height
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                               // 1-SAME,  0-VALID
-    int isNDHWC  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW

    const int rank = 5;
    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
@ -296,7 +296,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
    REQUIRE_TRUE(gradOShapeInfo[0]   == rank, 0, "CUSTOM CONV3D_BP OP: rank of output gradients (next epsilon) array must be equal to %i, but got %i instead !", rank, gradOShapeInfo);

    int indIOioC, indIiD, indWoC(4);
-    if(!isNDHWC) {
+    if(!isNCDHW) {
        indIOioC = 4; indIiD = 1;
    }
    else {
--- a/libnd4j/include/ops/declarable/helpers/convolutions.h
+++ b/libnd4j/include/ops/declarable/helpers/convolutions.h
@ -41,8 +41,10 @@ namespace nd4j {
            static inline void calcOutSizePool2D(int& oH, int& oW, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int iH, const int iW, const int paddingMode) {

                if(paddingMode == 0) {             // valid
-                    oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;
-                    oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;
+                    // oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;
+                    // oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;
+                    oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1;
+                    oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1;
                }
                else if (paddingMode == 1) {       // same
                    oH = (int) math::nd4j_ceil<double, double>(iH * 1. / sH);
@ -57,9 +59,9 @@ namespace nd4j {
            static inline void calcOutSizePool3D(int& oD, int& oH, int& oW, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int iD, const int iH, const int iW, const int paddingMode) {

                if(paddingMode == 0) {             // valid
-                    oD = (iD - (kD + (kD - 1) * (dD - 1)) + 2 * pD) / sD + 1;
-                    oH = (iH - (kH + (kH - 1) * (dH - 1)) + 2 * pH) / sH + 1;
-                    oW = (iW - (kW + (kW - 1) * (dW - 1)) + 2 * pW) / sW + 1;
+                    oD = (iD - ((kD - 1) * dD + 1) + 2 * pD) / sD + 1;
+                    oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1;
+                    oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1;
                }
                else if(paddingMode == 1) {        // same
                    oD = (int) nd4j::math::nd4j_ceil<double, double>(iD * 1. / sD);
--- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu
@ -1121,8 +1121,12 @@ namespace helpers {
        I const* cropSizes = reinterpret_cast<I const*>(cropSize->getSpecialBuffer());
        T* outBuf = reinterpret_cast<T*>(crops->specialBuffer());

+        int threadsPerBlock = math::nd4j_max(imageHeight * imageWidth, cropHeight * cropWidth);
+        if(threadsPerBlock > MAX_NUM_THREADS/4)
+            threadsPerBlock = MAX_NUM_THREADS/4;
+
        NDArray::prepareSpecialUse({crops}, {images, boxes, indices, cropSize});
-        cropAndResizeKernel<T,Z,I><<<batchSize, math::nd4j_max(imageHeight * imageWidth, cropHeight * cropWidth), 512, *stream>>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(),
+        cropAndResizeKernel<T,Z,I><<<batchSize, threadsPerBlock, 256, *stream>>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(),
                cropSizes, cropSize->getSpecialShapeInfo(), method, extrapolationVal, outBuf, crops->specialShapeInfo(), numBoxes, cropHeight, cropWidth, batchSize, imageHeight, imageWidth, depth);
        NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize});
    }
--- a/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu
@ -30,7 +30,7 @@ namespace helpers {
 //////////////////////////////////////////////////////////////////////////
 // calculate digamma function for array elements
 template <typename T>
-static void lgamma_(NDArray& x, NDArray& z) {
+void lgamma_(NDArray& x, NDArray& z) {
    //auto dtype = x.dataType();
    auto lgammaProc = LAMBDA_T(x_, dtype) {
        return T(DataTypeUtils::fromT<T>() == DataType::DOUBLE?::lgamma(x_): ::lgammaf(x_)); //math::nd4j_log<T,T>(math::nd4j_gamma<T,T>(x));
--- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp
+++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp
@ -535,8 +535,8 @@ namespace nd4j {
            // platform helpers use might be forbidden for various reasons, so we'll check it out first
            if (block->helpersAllowed() && nd4j::Environment::getInstance()->helpersAllowed()) {
                // if we have platform-specific helper for this op - invoke it
-                if (OpRegistrator::getInstance()->hasHelper(this->getOpHash())) {
-                    auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash());
+                if (OpRegistrator::getInstance()->hasHelper(this->getOpHash(), block->engine())) {
+                    auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash(), block->engine());
                    if (helper->isUsable(*block)) {
                        status = helper->invokeHelper(*block);
                        hasHelper = true;
--- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp
+++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp
@ -69,9 +69,9 @@ namespace nd4j {
            } else if (block.getTArguments()->size() > 0) {
                auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext());

-                NDArray::prepareSpecialUse({z}, {x, &y});
-
-                NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1));
+                x->applyScalarArr(static_cast<nd4j::scalar::Ops>(opNum), y, *z);
+                // NDArray::prepareSpecialUse({z}, {x, &y});
+                // NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1));

                manager.synchronize();
            } else {
--- a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp
+++ b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp
@ -173,15 +173,18 @@ namespace nd4j {
        }

        void OpRegistrator::registerHelper(nd4j::ops::platforms::PlatformHelper* op) {
-            if (_helpersLH.count(op->hash()) > 0)
+            std::pair<Nd4jLong, samediff::Engine> p = {op->hash(), op->engine()};
+            if (_helpersLH.count(p) > 0)
                throw std::runtime_error("Tried to double register PlatformHelper");

            _uniqueH.emplace_back(op);

-            std::pair<std::string, nd4j::ops::platforms::PlatformHelper*> pair(op->name(), op);
+            nd4j_debug("Adding helper for op \"%s\": [%lld - %i]\n", op->name().c_str(), op->hash(), (int) op->engine());
+
+            std::pair<std::pair<std::string, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> pair({op->name(), op->engine()}, op);
            _helpersH.insert(pair);

-            std::pair<Nd4jLong, nd4j::ops::platforms::PlatformHelper*> pair2(op->hash(), op);
+            std::pair<std::pair<Nd4jLong, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> pair2(p, op);
            _helpersLH.insert(pair2);
        }

@ -227,15 +230,17 @@ namespace nd4j {
            return _declarablesD.at(name);
        }

-        nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash) {
-            if (_helpersLH.count(hash) == 0)
+        nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash, samediff::Engine engine) {
+            std::pair<Nd4jLong, samediff::Engine> p = {hash, engine};
+            if (_helpersLH.count(p) == 0)
                throw std::runtime_error("Requested helper can't be found");

-            return _helpersLH[hash];
+            return _helpersLH[p];
        }

-        bool OpRegistrator::hasHelper(Nd4jLong hash) {
-            return _helpersLH.count(hash) > 0;
+        bool OpRegistrator::hasHelper(Nd4jLong hash, samediff::Engine engine) {
+            std::pair<Nd4jLong, samediff::Engine> p = {hash, engine};
+            return _helpersLH.count(p) > 0;
        }

        int OpRegistrator::numberOfOperations() {
--- a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp
+++ b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp
@ -24,10 +24,11 @@
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PlatformHelper::PlatformHelper(const char *name) {
+            PlatformHelper::PlatformHelper(const char *name, samediff::Engine engine) {
                // we just store name/hash of target operation
                _name = std::string(name);
                _hash = HashHelper::getInstance()->getLongHash(_name);
+                _engine = engine;
            }

            nd4j::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) {
@ -74,6 +75,10 @@ namespace nd4j {
                return z;
            }

+            samediff::Engine PlatformHelper::engine() {
+                return _engine;
+            }
+
            std::string PlatformHelper::name() {
                return _name;
            }
--- a/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu
@ -0,0 +1,275 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include "cudnnUtils.h"
+#include <ops/declarable/helpers/convolutions.h>
+
+namespace nd4j      {
+namespace ops       {
+namespace platforms {
+
+//////////////////////////////////////////////////////////////////////////
+// Runs batch-normalization inference through cudnnBatchNormalizationForwardInference.
+//
+// context        - launch context supplying the cuDNN handle and the CUDA stream
+// input, output  - arrays of identical shape, 4D (nchw) or 5D (ncdhw)
+// mean, variance - statistics arrays; one descriptor is shared by mean/variance/gamma/beta
+// gamma, beta    - optional scale / offset arrays, may be nullptr
+// epsilon        - forwarded unchanged to cuDNN
+// isSpatialMode  - true -> CUDNN_BATCHNORM_SPATIAL, false -> CUDNN_BATCHNORM_PER_ACTIVATION
+//
+// Throws nd4j::cuda_exception when a cuDNN call reports an error; tensor descriptors
+// are destroyed on every exit path.
+static void batchnormCUDNN(const LaunchContext* context,
+                            const NDArray* input, const NDArray* mean, const NDArray* variance,
+                            const NDArray* gamma, const NDArray* beta,
+                                  NDArray* output,
+                            const double epsilon, const bool isSpatialMode) {
+
+
+    // input, output -> 4D:nchw, 5D:ncdhw
+    // mean, variance, gamma, beta -> 1xCx1x1 for 4D and 1xCx1x1x1 for 5D for BATCHNORM_MODE_SPATIAL mode
+    //                             -> 1xCxHxW for 4D and 1xCxDxHxW for 5D for BATCHNORM_MODE_PER_ACTIVATION mode
+
+    const cudnnDataType_t dataType = cudnnDataType(input->dataType());
+
+    const int xRank = input->rankOf();
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: can't set stream for cuDNN", err);
+
+    const std::vector<int> xShape = input->getShapeAsVectorInt();               // input and output have same shapes
+
+    std::vector<int> paramsShape, paramsStrides;                                 // mean, variance, gamma and beta have same shapes
+    if(isSpatialMode) { // 1xCx1x1
+        const int iC = mean->lengthOf();
+        const int stride0 = mean->strideAt(0);
+        paramsShape   = xRank == 4 ? std::vector<int>({1, iC, 1, 1}) : std::vector<int>({1, iC, 1, 1, 1});
+        paramsStrides = xRank == 4 ? std::vector<int>({iC*stride0, stride0, 1, 1}) : std::vector<int>({iC*stride0, stride0, 1, 1, 1});
+    }
+    else {
+        paramsShape = mean->getShapeAsVectorInt();
+        paramsStrides = xRank == 4 ? std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3)}) : std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3), (int)mean->strideAt(4)});
+    }
+
+    std::vector<int> xStrides = {(int)input->strideAt(0),  (int)input->strideAt(1),  (int)input->strideAt(2),  (int)input->strideAt(3)};
+    std::vector<int> zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3)};
+
+    if(xRank > 4) { // 5D
+        xStrides.push_back((int)input->strideAt(4));
+        zStrides.push_back((int)output->strideAt(4));
+    }
+
+    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
+
+    // descriptors start as nullptr so that cleanup can tell which ones were actually created
+    cudnnTensorDescriptor_t x = nullptr, z = nullptr, params = nullptr;
+
+    // destroys every descriptor created so far
+    auto destroyDescriptors = [&]() {
+        if (params != nullptr) cudnnDestroyTensorDescriptor(params);
+        if (z != nullptr)      cudnnDestroyTensorDescriptor(z);
+        if (x != nullptr)      cudnnDestroyTensorDescriptor(x);
+        params = z = x = nullptr;
+    };
+
+    // releases descriptors, then raises the exception describing the failed call
+    auto fail = [&](const char* message, const int code) {
+        destroyDescriptors();
+        throw nd4j::cuda_exception::build(message, code);
+    };
+
+    // input descriptor (a failed create leaves x == nullptr, which the following set call rejects)
+    cudnnCreateTensorDescriptor(&x);
+    if(input->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data());
+    if (err != 0) fail("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
+
+    // output descriptor
+    cudnnCreateTensorDescriptor(&z);
+    if(output->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(z, format, dataType, xRank, xShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(z, dataType, xRank, xShape.data(), zStrides.data());
+    if (err != 0) fail("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err);
+
+    // mean, variance, gamma and beta descriptor, the same descriptor for all of them
+    cudnnCreateTensorDescriptor(&params);
+    if(mean->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data());
+    if (err != 0) fail("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err);
+
+    // provide scaling parameters: float scalars for element sizes up to 4 bytes, double otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* ptrAlpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* ptrBeta  = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({output}, {input, mean, variance, gamma, beta});
+
+    // calculations
+    // NOTE(review): nullptr is passed for scale/bias when gamma/beta are absent - confirm cuDNN accepts that
+    err = cudnnBatchNormalizationForwardInference(*handle, isSpatialMode ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION,
+                                                 ptrAlpha, ptrBeta,
+                                                 x, input->getSpecialBuffer(),
+                                                 z, output->getSpecialBuffer(),
+                                                 params,
+                                                 gamma ? gamma->getSpecialBuffer(): nullptr,
+                                                 beta  ? beta->getSpecialBuffer() : nullptr,
+                                                 mean->getSpecialBuffer(), variance->getSpecialBuffer(), epsilon);
+
+    if (err != 0) fail("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err);
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("batchnormCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    destroyDescriptors();
+
+    NDArray::registerSpecialUse({output}, {input, mean, variance, gamma, beta});
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+// cuDNN-backed implementation of the batchnorm op.
+// Inputs : 0 - input, 1 - mean, 2 - variance, then gamma (if applyScale) and beta (if applyOffset).
+// IArgs  : 0 - applyScale, 1 - applyOffset, 2.. - axes to normalize over (default: last axis).
+// TArgs  : 0 - epsilon.
+// Channels-last input/output are viewed as channels-first before calling batchnormCUDNN.
+PLATFORM_IMPL(batchnorm, ENGINE_CUDA) {
+
+    auto input    = INPUT_VARIABLE(0);
+    auto mean     = INPUT_VARIABLE(1);
+    auto variance = INPUT_VARIABLE(2);
+    NDArray* gamma    = nullptr;
+    NDArray* beta     = nullptr;
+
+    auto output   = OUTPUT_VARIABLE(0);
+
+    const bool   applyScale  = (bool)INT_ARG(0);
+    const bool   applyOffset = (bool)INT_ARG(1);
+    const double epsilon     = T_ARG(0);
+
+    if(applyScale)
+        gamma = INPUT_VARIABLE(3);
+    if(applyOffset)
+        beta = INPUT_VARIABLE(3 + (int)applyScale);     // beta follows gamma when gamma is present
+
+    const int numOfIntArgs = block.getIArguments()->size();
+    const int inRank = input->rankOf();
+
+    // get axes args to normalize input array over
+    std::vector<int> axes;
+    if(numOfIntArgs > 2)
+        for(int i = 2; i < numOfIntArgs; ++i)
+            axes.push_back(INT_ARG(i));
+    else
+        axes.push_back(inRank-1);               // default dimension to reduce along is last dimension
+
+    const int numOfAxes = axes.size();
+    REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM CUDNN op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
+
+    // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
+    // for example if input shape is {2,3,4,5,6} and axes = {1,3}, then expected shape would be {1,3,1,5,1}, and if axes = {3}, then expected shape would be {5}
+    std::vector<Nd4jLong> expShape;
+    if(numOfAxes == 1)
+        expShape.push_back(input->sizeAt(axes[0]));
+    else {      // get, for example, something like {1, inputDim1, 1, inputDim3, 1} if axes = {1, 3}
+        expShape = std::vector<Nd4jLong>(inRank, 1);
+        for(int i = 0; i < numOfAxes; ++i)
+            expShape[axes[i]] = input->sizeAt(axes[i]);
+    }
+
+    REQUIRE_TRUE(mean->isSameShape(expShape) , 0, "BATCHNORM CUDNN op: wrong shape of mean array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(mean).c_str());
+    REQUIRE_TRUE(variance->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of variance array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(variance).c_str());
+    if(gamma)
+        REQUIRE_TRUE(gamma->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of gamma array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(gamma).c_str());
+    if(beta)
+        REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
+
+    // types of all input arrays should be the same
+    for(int i = 1; i < block.width(); ++i)
+        REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM CUDNN op: types of all input arrays should be the same !");
+
+    // cudnn supports NCHW format only
+    const bool needPermut = axes.size() == 1 && mean->lengthOf() == input->sizeAt(-1);
+
+    if(needPermut) {    // if NHWC (4D) or NDHWC (5D)
+        // move the trailing channel axis to position 1; the permutation length has to match the rank
+        std::vector<int> perm = inRank == 4 ? std::vector<int>({0, 3, 1, 2}) : std::vector<int>({0, 4, 1, 2, 3});
+        input  = new NDArray(input->permute(perm));
+        output = new NDArray(output->permute(perm));
+    }
+
+    // calculations; the permuted views are released even when the helper throws
+    try {
+        batchnormCUDNN(block.launchContext(), input, mean, variance, gamma, beta, output, epsilon, axes.size() == 1);
+    }
+    catch (...) {
+        if(needPermut) {
+            delete input;
+            delete output;
+        }
+        throw;
+    }
+
+    if(needPermut) {
+        delete input;
+        delete output;
+    }
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Decides whether the cuDNN batchnorm helper may serve the current op invocation.
+// Currently it always returns false (see the unconditional return below), so the
+// remaining checks are unreachable and only document the intended constraints.
+PLATFORM_CHECK(batchnorm, ENGINE_CUDA) {
+
+    const bool   applyScale  = (bool)INT_ARG(0);
+    const bool   applyOffset = (bool)INT_ARG(1);
+
+    NDArray* input     = INPUT_VARIABLE(0);
+    NDArray* mean      = INPUT_VARIABLE(1);
+    NDArray* variance  = INPUT_VARIABLE(2);
+    NDArray* gamma     = applyScale  ? INPUT_VARIABLE(3) : nullptr;
+    NDArray* beta      = applyOffset ? INPUT_VARIABLE(3 + (int)applyScale) : nullptr;
+
+    const int numOfIntArgs = block.getIArguments()->size();
+    const int xRank = input->rankOf();
+
+    // disable cudnn batchnorm so far
+    return false;
+
+    // ---- everything below is unreachable while the return above is in place ----
+
+    // *********************************** //
+    // only 4D and 5D inputs are accepted
+    if(xRank != 4 && xRank != 5)
+        return false;
+
+    // *********************************** //
+    // only DOUBLE, FLOAT32 and HALF inputs are accepted
+    const bool badType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
+    if(badType)
+        return false;
+
+    // *********************************** //
+    // get axes args to normalize input array over
+    std::vector<int> axes;
+    if(numOfIntArgs > 2)
+        for(int i = 2; i < numOfIntArgs; ++i)
+            axes.push_back(INT_ARG(i));
+    else
+        axes.push_back(xRank-1);               // default dimension to reduce along is last dimension
+
+    // accepted axis counts: 1, 3 or 4
+    if(axes.size() != 1 && axes.size() != 3 && axes.size() != 4)
+        return false;
+
+    // *********************************** //
+    // variance, gamma and beta must match mean in shape and strides
+    bool allParamsHaveSameShapeAndStrides = shape::haveSameShapeAndStrides(mean->getShapeInfo(), variance->getShapeInfo());
+    if(gamma)
+        allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), gamma->getShapeInfo());
+    if(beta)
+        allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), beta->getShapeInfo());
+
+    if(!allParamsHaveSameShapeAndStrides)
+        return false;
+
+    // *********************************** //
+    // single axis: mean length must equal input dimension 1 or the last input dimension;
+    // several axes: mean must have the input shape with dimension 0 replaced by 1
+    bool isFormatGood = false;
+    if(axes.size() == 1)
+        isFormatGood = mean->lengthOf() == input->sizeAt(1) || mean->lengthOf() == input->sizeAt(-1);   // mean [C]
+    else {
+        auto inputShapeModif = input->getShapeAsVector();     // [dim0,dim1,dim2,dim3] 4D or [dim0,dim1,dim2,dim3,dim4]
+        inputShapeModif[0] = 1;
+        isFormatGood = mean->isSameShape(inputShapeModif);    // mean [1,dim1,dim2,dim3] 4D or [1,dim1,dim2,dim3,dim4]
+    }
+    if(!isFormatGood)
+        return false;
+
+    return true;
+}
+
+
+}
+}
+}
--- a/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu
+++ b/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu
@ -0,0 +1,521 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include "cudnnUtils.h"
+#include <ops/declarable/helpers/convolutions.h>
+
+namespace nd4j      {
+namespace ops       {
+namespace platforms {
+
+//////////////////////////////////////////////////////////////////////////
+// Forward 2D convolution (plus optional bias add) executed through cuDNN.
+//
+// context     - launch context supplying the cuDNN handle and the CUDA stream
+// input       - [bS, iC, iH, iW] when isNCHW, otherwise [bS, iH, iW, iC]
+// weights     - described to cuDNN as an NCHW filter {oC, iC, kH, kW}
+// bias        - optional, added to the output through cudnnAddTensor; may be nullptr
+// output      - receives the convolution result
+// kH,kW / sH,sW / pH,pW / dH,dW - kernel size, strides, paddings, dilations
+// paddingMode - not used inside this function
+// isNCHW      - selects CUDNN_TENSOR_NCHW or CUDNN_TENSOR_NHWC for input/output/bias descriptors
+//
+// Throws nd4j::cuda_exception when a cuDNN/CUDA call fails; descriptors and the
+// workspace buffer are released on every exit path.
+static void conv2dCUDNN(const LaunchContext* context,
+                        const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
+                        const int kH, const int kW,
+                        const int sH, const int sW,
+                        const int pH, const int pW,
+                        const int dH, const int dW,
+                        const int paddingMode, const bool isNCHW) {
+
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err);
+
+    cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // all cuDNN/CUDA resources start as nullptr so that cleanup can tell what was created
+    cudnnTensorDescriptor_t x = nullptr, z = nullptr, b = nullptr;
+    cudnnFilterDescriptor_t w = nullptr;
+    cudnnConvolutionDescriptor_t conv = nullptr;
+    void* wsData = nullptr;
+
+    // destroys every descriptor created so far
+    auto destroyDescriptors = [&]() {
+        if (b != nullptr)    cudnnDestroyTensorDescriptor(b);
+        if (conv != nullptr) cudnnDestroyConvolutionDescriptor(conv);
+        if (z != nullptr)    cudnnDestroyTensorDescriptor(z);
+        if (w != nullptr)    cudnnDestroyFilterDescriptor(w);
+        if (x != nullptr)    cudnnDestroyTensorDescriptor(x);
+        b = z = x = nullptr;
+        w = nullptr;
+        conv = nullptr;
+    };
+
+    // releases workspace and descriptors, then raises the exception describing the failed call
+    auto fail = [&](const char* message, const int code) {
+        cudaFree(wsData);       // no-op for nullptr
+        wsData = nullptr;
+        destroyDescriptors();
+        throw nd4j::cuda_exception::build(message, code);
+    };
+
+    // input descriptor
+    cudnnCreateTensorDescriptor(&x);
+    if(input->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
+    if (err != 0) fail("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
+
+    // weights descriptor
+    cudnnCreateFilterDescriptor(&w);
+    err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
+    if(err != 0) fail("conv2dCUDNN: cudnnSetFilter4dDescriptor failed", err);
+
+    // output descriptor
+    cudnnCreateTensorDescriptor(&z);
+    if(output->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1));
+    if (err != 0) fail("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err);
+
+    // description of convolution
+    cudnnCreateConvolutionDescriptor(&conv);
+    err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
+    if (err != 0) fail("conv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err);
+
+    // algorithm description
+    cudnnConvolutionFwdAlgo_t algo;
+    err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
+    if (err != 0) fail("conv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
+
+
+    // allocate auxiliary device memory, abbreviation ws means workspace
+    size_t wsSize;
+    err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize);
+    if (err != 0) fail("conv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
+    auto cudaErr = cudaMalloc(&wsData, wsSize);
+    if (cudaErr != 0) fail("conv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
+
+    // provide scaling parameters: float scalars for element sizes up to 4 bytes, double otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({output}, {input, weights, bias});
+
+    // run calculation
+    err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer());
+    if (err != 0) fail("conv2dCUDNN: cudnnConvolutionForward failed", err);
+
+    // add bias if it is present
+    if (bias != nullptr) {
+
+        cudnnCreateTensorDescriptor(&b);
+        err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf());
+        if (err != 0) fail("conv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err);
+        // alpha (== 1) is used for both scaling factors: output = 1 * bias + 1 * output
+        err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer());
+        if (err != 0) fail("conv2dCUDNN: cudnnAddTensor bias failed", err);
+    }
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("conv2dCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    cudaErr = cudaFree(wsData);
+    wsData = nullptr;
+    if (cudaErr != 0) fail("conv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
+
+    destroyDescriptors();
+
+    NDArray::registerSpecialUse({output}, {input, weights, bias});
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Backward pass of 2D convolution executed through cuDNN: computes gradI, gradW and,
+// when gradB is not nullptr, gradB.
+//
+// context     - launch context supplying the cuDNN handle and the CUDA stream
+// input       - forward-pass input
+// weights     - forward-pass weights, described to cuDNN as an NCHW filter {oC, iC, kH, kW}
+// gradO       - gradient with respect to the forward output
+// gradI/gradW - receive gradients with respect to input / weights
+// gradB       - optional, receives gradient with respect to bias
+// kH,kW / sH,sW / pH,pW / dH,dW - kernel size, strides, paddings, dilations
+// paddingMode - not used inside this function
+// isNCHW      - selects CUDNN_TENSOR_NCHW or CUDNN_TENSOR_NHWC for the tensor descriptors
+//
+// Throws nd4j::cuda_exception when a cuDNN/CUDA call fails; descriptors and both
+// workspace buffers are released on every exit path.
+static void conv2dBpCUDNN(const LaunchContext* context,
+                          const NDArray* input, const NDArray* weights, const NDArray* gradO,
+                          NDArray* gradI, NDArray* gradW, NDArray* gradB,
+                          const int kH, const int kW,
+                          const int sH, const int sW,
+                          const int pH, const int pW,
+                          const int dH, const int dW,
+                          const int paddingMode, const bool isNCHW) {
+
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: can't set stream for cuDNN", err);
+
+    cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // all cuDNN/CUDA resources start as nullptr so that cleanup can tell what was created
+    cudnnTensorDescriptor_t x = nullptr, dz = nullptr, dx = nullptr, db = nullptr;
+    cudnnFilterDescriptor_t dw = nullptr;
+    cudnnConvolutionDescriptor_t conv = nullptr;
+    void* wsGradWData = nullptr;
+    void* wsGradIData = nullptr;
+
+    // destroys every descriptor created so far
+    auto destroyDescriptors = [&]() {
+        if (db != nullptr)   cudnnDestroyTensorDescriptor(db);
+        if (conv != nullptr) cudnnDestroyConvolutionDescriptor(conv);
+        if (dw != nullptr)   cudnnDestroyFilterDescriptor(dw);
+        if (dx != nullptr)   cudnnDestroyTensorDescriptor(dx);
+        if (dz != nullptr)   cudnnDestroyTensorDescriptor(dz);
+        if (x != nullptr)    cudnnDestroyTensorDescriptor(x);
+        db = dx = dz = x = nullptr;
+        dw = nullptr;
+        conv = nullptr;
+    };
+
+    // releases workspaces and descriptors, then raises the exception describing the failed call
+    auto fail = [&](const char* message, const int code) {
+        cudaFree(wsGradWData);      // no-op for nullptr
+        cudaFree(wsGradIData);
+        wsGradWData = wsGradIData = nullptr;
+        destroyDescriptors();
+        throw nd4j::cuda_exception::build(message, code);
+    };
+
+    // input descriptor
+    cudnnCreateTensorDescriptor(&x);
+    if(input->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
+    if (err != 0) fail("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
+
+    // gradO descriptor
+    cudnnCreateTensorDescriptor(&dz);
+    if(gradO->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1));
+    if (err != 0) fail("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err);
+
+    // gradI descriptor
+    cudnnCreateTensorDescriptor(&dx);
+    if(gradI->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1));
+    if (err != 0) fail("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err);
+
+    // gradW descriptor (also used to describe the forward weights in the backward-data call)
+    cudnnCreateFilterDescriptor(&dw);
+    err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
+    if(err != 0) fail("conv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err);
+
+    // description of convolution
+    cudnnCreateConvolutionDescriptor(&conv);
+    err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
+    if (err != 0) fail("conv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err);
+
+    // gradW algorithm description
+    cudnnConvolutionBwdFilterAlgo_t algoGradW;
+    err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
+    if (err != 0) fail("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
+
+    // gradI algorithm description; dx (not x) is passed so that the algorithm is chosen for the
+    // same descriptor that the workspace query and the backward-data call below use
+    cudnnConvolutionBwdDataAlgo_t algoGradI;
+    err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, dx, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
+    if (err != 0) fail("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
+
+    // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
+    size_t wsGradWSize;
+    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
+    if (err != 0) fail("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
+    auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
+    if (cudaErr != 0) fail("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
+
+    // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
+    size_t wsGradISize;
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
+    if (err != 0) fail("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
+    cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
+    if (cudaErr != 0) fail("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    // provide scaling parameters: float scalars for element sizes up to 4 bytes, double otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+
+    // run calculation for gradB (if not nullptr)
+    if(gradB != nullptr) {
+        cudnnCreateTensorDescriptor(&db);
+        err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf());
+        if (err != 0) fail("conv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err);
+
+        err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
+        if (err != 0) fail("conv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
+    }
+
+    // run calculation for gradW
+    err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
+    if (err != 0) fail("conv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
+
+    // run calculation for gradI
+    err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
+    if (err != 0) fail("conv2dBpCUDNN: cudnnConvolutionBackwardData failed", err);
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("conv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    cudaErr = cudaFree(wsGradWData);
+    wsGradWData = nullptr;
+    if (cudaErr != 0) fail("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
+    cudaErr = cudaFree(wsGradIData);
+    wsGradIData = nullptr;
+    if (cudaErr != 0) fail("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    destroyDescriptors();
+
+    NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+}
+
+//////////////////////////////////////////////////////////////////////////
+// cuDNN-backed implementation of the conv2d op.
+// Inputs : 0 - input, 1 - weights [kH, kW, iC, oC], 2 - optional bias [oC].
+// IArgs  : 0,1 - kH,kW (taken from the weights shape when not positive), 2,3 - strides,
+//          4,5 - paddings, 6,7 - dilations, 8 - padding mode, 9 - optional data format flag.
+// Weights are copied into {oC, iC, kH, kW} layout before conv2dCUDNN is called; every
+// temporary array is released even when the helper throws.
+PLATFORM_IMPL(conv2d, ENGINE_CUDA) {
+
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
+
+    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+
+    int sH = INT_ARG(2);                                                        // strides height
+    int sW = INT_ARG(3);                                                        // strides width
+    int pH = INT_ARG(4);                                                        // paddings height
+    int pW = INT_ARG(5);                                                        // paddings width
+    int dH = INT_ARG(6);                                                        // dilations height
+    int dW = INT_ARG(7);                                                        // dilations width
+    int paddingMode = INT_ARG(8);                                               // 0-VALID, 1-SAME
+    bool isNCHW    = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+
+    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
+    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width
+
+    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
+
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong>  expectedWeightsShape = {kH, kW, iC, oC};
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    if (bias) {
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+        REQUIRE_TRUE((bias->rankOf() == 1 && bias->strideAt(0) == 1) || (bias->rankOf() == 2 && bias->sizeAt(0) == 1 && bias->strideAt(1) == 1) || (bias->rankOf() == 2 && bias->sizeAt(1) == 1 && bias->strideAt(0) == 1), 0, "CUSTOM CONV2D CUDNN OP: bias array should be contiguous in memory !");
+    }
+
+    // automatic storage: the repacked weights are released on every exit path, including exceptions
+    NDArray newWeights(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC}
+    newWeights.assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC  --> oC, iC, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = nullptr;
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
+        checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
+
+    try {
+        conv2dCUDNN(block.launchContext(), newInput, &newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW);
+    }
+    catch (...) {
+        // newInput differs from input only when the padding helper replaced it with a new array
+        if(newInput != input)
+            delete newInput;
+        throw;
+    }
+
+    if(newInput != input)
+        delete newInput;
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Tells whether the cuDNN conv2d helper can handle the given context:
+// causal padding (mode 2) is rejected and all arrays must be DOUBLE, FLOAT32 or HALF.
+PLATFORM_CHECK(conv2d, ENGINE_CUDA) {
+
+    // true when the data type of the array is none of DOUBLE / FLOAT32 / HALF
+    const auto hasUnsupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type != DataType::DOUBLE && type != DataType::FLOAT32 && type != DataType::HALF;
+    };
+
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC], optional
+
+    const int paddingMode = INT_ARG(8);                                  // 0-VALID, 1-SAME, 2-CAUSAL
+
+    if (paddingMode == 2)
+        return false;
+
+    if (hasUnsupportedType(input) || hasUnsupportedType(weights))
+        return false;
+
+    // bias is optional, so it is examined only when present
+    return bias == nullptr || !hasUnsupportedType(bias);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Backprop of 2D convolution executed through cuDNN.
+// Inputs : input, weights, optional bias (present when block.width() > 3), gradO.
+// Outputs: gradI, gradW and gradB (gradB only when bias is present).
+// Integer args: kH, kW, sH, sW, pH, pW, dH, dW, paddingMode and optional data format flag.
+// Weights and their gradient are copied to/from temporary arrays with [oC, iC, kH, kW] layout, which is what is handed to conv2dBpCUDNN.
+// In SAME padding mode checkConv2dCUDNNPadAsymmetric may substitute input/gradI with newly allocated arrays;
+// in that case gradI is filled from a sub-array of the substituted gradient and the substitutes are deleted here.
+PLATFORM_IMPL(conv2d_bp, ENGINE_CUDA) {
+
+    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, iC, oC] always
+    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+
+    int kH = INT_ARG(0);                                                        // filter(kernel) height
+    int kW = INT_ARG(1);                                                        // filter(kernel) width
+    int sH = INT_ARG(2);                                                        // strides height
+    int sW = INT_ARG(3);                                                        // strides width
+    int pH = INT_ARG(4);                                                        // paddings height
+    int pW = INT_ARG(5);                                                        // paddings width
+    int dH = INT_ARG(6);                                                        // dilations height
+    int dW = INT_ARG(7);                                                        // dilations width
+    int paddingMode = INT_ARG(8);                                               // 0-VALID, 1-SAME
+    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+
+    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
+    REQUIRE_TRUE(gradO->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of output's gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf());
+
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    int trueoH, trueoW;          // true output height, width
+    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
+    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM CONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    if(bias) {
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+    }
+
+    // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC}
+    // temporaries live on the stack, so they are released even if the cuDNN helper throws
+    NDArray newGradW(gradW->ordering(),     {oC, iC, kH, kW}, gradW->dataType(),   gradW->getContext());
+    NDArray newWeights(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext());
+
+    newWeights.assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC  --> oC, iC, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = gradI;
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottom paddings
+        checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
+
+    try {
+        conv2dBpCUDNN(block.launchContext(), newInput, &newWeights, gradO,   newGradI, &newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW);
+    }
+    catch (...) {
+        // do not leak substituted (padded) arrays when the cuDNN helper fails
+        if(newInput != input) {
+            delete newInput;
+            delete newGradI;
+        }
+        throw;
+    }
+
+    newGradW.permutei({2,3,1,0});  // [oC, iC, kH, kW] -> [kH, kW, iC, oC]
+    gradW->assign(&newGradW);
+
+    if(newInput != input) {
+
+        // input was substituted, so the computed gradient is larger than gradI: copy back only the leading gradI-sized part of spatial dimensions
+        if(isNCHW)
+            gradI->assign((*newGradI)({0,0,  0,0,  0,gradI->sizeAt(2),  0,gradI->sizeAt(3)}));
+        else
+            gradI->assign((*newGradI)({0,0,  0,gradI->sizeAt(1),  0,gradI->sizeAt(2),  0,0}));
+
+        delete newInput;
+        delete newGradI;
+    }
+
+    return Status::OK();
+}
+
+// Tells whether the cuDNN conv2d_bp helper can handle the given context:
+// only NCHW data format, no causal padding (mode 2), and all arrays must be DOUBLE, FLOAT32 or HALF.
+PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) {
+
+    // true when the data type of the array is none of DOUBLE / FLOAT32 / HALF
+    const auto hasUnsupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type != DataType::DOUBLE && type != DataType::FLOAT32 && type != DataType::HALF;
+    };
+
+    auto input   = INPUT_VARIABLE(0);                                           // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                           // [kH, kW, iC, oC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;             // [oC], optional
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+    const int paddingMode = INT_ARG(8);                                             // 0-VALID, 1-SAME, 2-CAUSAL
+    const int isNCHW      = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;    // INT_ARG(9): 0-NCHW, 1-NHWC
+
+    if (!isNCHW || paddingMode == 2)
+        return false;
+
+    if (hasUnsupportedType(input) || hasUnsupportedType(weights) || hasUnsupportedType(gradO))
+        return false;
+
+    // bias is optional, so it is examined only when present
+    return bias == nullptr || !hasUnsupportedType(bias);
+}
+
+
+
+
+
+
+
+// PLATFORM_IMPL(conv2d, ENGINE_CUDA) {
+
+//     auto handle = reinterpret_cast<cudnnHandle_t *>(block.launchContext()->getCuDnnHandle());
+//     auto res = cudnnSetStream(*handle, *block.launchContext()->getCudaStream());
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("Can't set stream for cuDNN", res);
+
+//     auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+//     auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC] always
+//     auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
+
+//     auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+
+//     NDArray::prepareSpecialUse({output}, {input, weights, bias});
+
+//     int sH = INT_ARG(2);                                                        // strides height
+//     int sW = INT_ARG(3);                                                        // strides width
+//     int pH = INT_ARG(4);                                                        // paddings height
+//     int pW = INT_ARG(5);                                                        // paddings width
+//     int dH = INT_ARG(6);                                                        // dilations height
+//     int dW = INT_ARG(7);                                                        // dilations width
+//     int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
+//     bool isNCHW    = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+
+//     int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
+//     int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width
+
+//     int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+//     int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+//     ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+//     ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, isSameMode);
+
+//     auto dtype = cudnnDataType(input->dataType());
+
+
+//     cudnnTensorDescriptor_t src;
+//     cudnnCreateTensorDescriptor(&src);
+//     res = cudnnSetTensor4dDescriptorEx(src, dtype, input->sizeAt(0), input->sizeAt(1), input->sizeAt(2), input->sizeAt(3), input->strideAt(0), input->strideAt(1), input->strideAt(2), input->strideAt(3));
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx src failed", res);
+
+//     // TODO: we definitely want NHWC here as well
+//     cudnnFilterDescriptor_t wght;
+//     cudnnCreateFilterDescriptor(&wght);
+//     res = cudnnSetFilter4dDescriptor(wght, dtype, CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnSetFilter4dDescriptor failed", res);
+
+//     cudnnConvolutionDescriptor_t cdc;
+//     cudnnCreateConvolutionDescriptor(&cdc);
+//     res = cudnnSetConvolution2dDescriptor(cdc, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, dtype);
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnSetConvolution2dDescriptor failed", res);
+
+//     cudnnTensorDescriptor_t dst;
+//     cudnnCreateTensorDescriptor(&dst);
+//     res = cudnnSetTensor4dDescriptorEx(dst, dtype, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3), output->strideAt(0), output->strideAt(1), output->strideAt(2), output->strideAt(3));
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx dst failed", res);
+
+//     // TODO: workspace algorithms are supposed to be faster, so we should use it here if we have enough memory
+//     cudnnConvolutionFwdAlgo_t algo;
+//     res = cudnnGetConvolutionForwardAlgorithm(*handle, src, wght, cdc, dst, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo);
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnGetConvolutionForwardAlgorithm failed", res);
+
+//     // TODO: should be float if dtype is half/float, and double otherwise
+//     float alpha = 1.0f;
+//     float beta = 0.0f;
+//     res = cudnnConvolutionForward(*handle, &alpha, src, input->specialBuffer(), wght, weights->specialBuffer(), cdc, algo, nullptr, 0, &beta, dst, output->specialBuffer());
+//     if (res != 0)
+//         throw nd4j::cuda_exception::build("cudnnConvolutionForward failed", res);
+
+
+//     if (bias != nullptr) {
+//         cudnnTensorDescriptor_t bs;
+//         cudnnCreateTensorDescriptor(&bs);
+//         if (isNCHW) {
+//             res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NCHW, dtype, 1, bias->lengthOf(), 1, 1);
+//             if (res != 0)
+//                 throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res);
+//         } else {
+//             res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NHWC, dtype, 1, 1, 1, bias->lengthOf());
+//             if (res != 0)
+//                 throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res);
+//         }
+
+//         res = cudnnAddTensor(*handle, &alpha, bs, bias->specialBuffer(), &alpha, dst, output->specialBuffer());
+//         if (res != 0)
+//             throw nd4j::cuda_exception::build("cudnnAddTensor failed", res);
+//     }
+
+
+//     NDArray::registerSpecialUse({output}, {input, weights, bias});
+
+//     return Status::OK();
+// }
+
+
+}
+}
+}
--- a/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu
+++ b/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu
@ -0,0 +1,453 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include "cudnnUtils.h"
+#include <ops/declarable/helpers/convolutions.h>
+
+namespace nd4j      {
+namespace ops       {
+namespace platforms {
+
+//////////////////////////////////////////////////////////////////////////
+// Forward 3D convolution (CUDNN_CROSS_CORRELATION mode) executed through cuDNN: output = conv(input, weights) [+ bias].
+//   context     - launch context providing the cuDNN handle and the CUDA stream
+//   input       - input array, output - array receiving the result, bias - optional (may be nullptr)
+//   weights     - described to cuDNN as [oC, iC, kD, kH, kW]
+//                 NOTE(review): callers are presumably expected to pass weights already permuted to that layout - confirm
+//   kD,kH,kW    - kernel sizes; sD,sH,sW - strides; pD,pH,pW - paddings; dD,dH,dW - dilations
+//   paddingMode - not used by this helper
+//   isNCDHW     - true: NCDHW layout of input/output, false: NDHWC
+// Throws nd4j::cuda_exception when any cuDNN / CUDA runtime call fails.
+static void conv3dCUDNN(const LaunchContext* context,
+                        const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
+                        const int kD, const int kH, const int kW,
+                        const int sD, const int sH, const int sW,
+                        const int pD, const int pH, const int pW,
+                        const int dD, const int dH, const int dW,
+                        const int paddingMode, const bool isNCDHW) {
+
+    // owns all cuDNN descriptors and the workspace created below, releasing them on normal return as well as on a throw
+    struct Resources {
+        cudnnTensorDescriptor_t      x    = nullptr;
+        cudnnTensorDescriptor_t      z    = nullptr;
+        cudnnTensorDescriptor_t      b    = nullptr;
+        cudnnFilterDescriptor_t      w    = nullptr;
+        cudnnConvolutionDescriptor_t conv = nullptr;
+        void*                        ws   = nullptr;
+
+        ~Resources() {
+            if (b    != nullptr) cudnnDestroyTensorDescriptor(b);
+            if (z    != nullptr) cudnnDestroyTensorDescriptor(z);
+            if (x    != nullptr) cudnnDestroyTensorDescriptor(x);
+            if (w    != nullptr) cudnnDestroyFilterDescriptor(w);
+            if (conv != nullptr) cudnnDestroyConvolutionDescriptor(conv);
+            if (ws   != nullptr) cudaFree(ws);
+        }
+    } res;
+
+    const int numDims = 5;
+
+    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
+    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: can't set stream for cuDNN", err);
+
+    const std::vector<int> pads        = {pD, pH, pW};
+    const std::vector<int> filtStrides = {sD, sH, sW};
+    const std::vector<int> dilations   = {dD, dH, dW};
+
+    // dimensions are passed to cuDNN in N,C,D,H,W order for both layouts, the layout itself is expressed via format/strides
+    const std::vector<int> xShape   = {bS, iC, iD, iH, iW};
+    const std::vector<int> zShape   = {bS, oC, oD, oH, oW};
+    const std::vector<int> wShape   = {oC, iC, kD, kH, kW};
+    const std::vector<int> bShape   = {1, oC, 1, 1, 1};         // bias broadcasts along the channel dimension, same dim order as zShape
+
+    const std::vector<int> xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)};
+    const std::vector<int> zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3), (int)output->strideAt(4)};
+
+    cudnnTensorFormat_t format = isNCDHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // input descriptor
+    err = cudnnCreateTensorDescriptor(&res.x);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnCreateTensorDescriptor for input failed", err);
+    if(input->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(res.x, format, cudnnDataType(input->dataType()), numDims, xShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(res.x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
+
+    // weights descriptor
+    err = cudnnCreateFilterDescriptor(&res.w);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnCreateFilterDescriptor failed", err);
+    err = cudnnSetFilterNdDescriptor(res.w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data());
+    if(err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetFilterNdDescriptor failed", err);
+
+    // output descriptor
+    err = cudnnCreateTensorDescriptor(&res.z);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnCreateTensorDescriptor for output failed", err);
+    if(output->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(res.z, format, cudnnDataType(output->dataType()), numDims, zShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(res.z, cudnnDataType(output->dataType()), numDims, zShape.data(), zStrides.data());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err);
+
+    // description of convolution
+    err = cudnnCreateConvolutionDescriptor(&res.conv);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnCreateConvolutionDescriptor failed", err);
+    err = cudnnSetConvolutionNdDescriptor(res.conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
+
+    // algorithm description
+    cudnnConvolutionFwdAlgo_t algo;
+    err = cudnnGetConvolutionForwardAlgorithm(*handle, res.x, res.w, res.conv, res.z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
+
+    // allocate auxiliary device memory, abbreviation ws means workspace
+    size_t wsSize;
+    err = cudnnGetConvolutionForwardWorkspaceSize(*handle, res.x, res.w, res.conv, res.z, algo, &wsSize);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
+    auto cudaErr = cudaMalloc(&res.ws, wsSize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
+
+    // provide scaling parameters: float scalars for element types of up to 4 bytes, double scalars otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({output}, {input, weights, bias});
+
+    // run calculation
+    err = cudnnConvolutionForward(*handle, alpha, res.x, input->getSpecialBuffer(), res.w, weights->getSpecialBuffer(), res.conv, algo, res.ws, wsSize, beta, res.z, output->specialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnConvolutionForward failed", err);
+
+    // add bias if it is present
+    if (bias != nullptr) {
+
+        err = cudnnCreateTensorDescriptor(&res.b);
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnCreateTensorDescriptor for bias failed", err);
+        // only the channel dimension of bias is larger than 1, so one packed descriptor serves both layouts
+        err = cudnnSetTensorNdDescriptorEx(res.b, CUDNN_TENSOR_NCHW, cudnnDataType(bias->dataType()), numDims, bShape.data());
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor for bias failed", err);
+        err = cudnnAddTensor(*handle, alpha, res.b, bias->getSpecialBuffer(), alpha, res.z, output->specialBuffer());
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnAddTensor bias failed", err);
+    }
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("conv3dCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    // release workspace explicitly in order to report a failure; ownership is dropped first so the destructor doesn't free it twice
+    void* wsData = res.ws;
+    res.ws = nullptr;
+    cudaErr = cudaFree(wsData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
+
+    NDArray::registerSpecialUse({output}, {input, weights, bias});
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Backprop of 3D convolution (CUDNN_CROSS_CORRELATION mode) executed through cuDNN.
+//   context     - launch context providing the cuDNN handle and the CUDA stream
+//   input, weights, gradO - forward input, weights and gradient with respect to the forward output
+//   gradI, gradW          - receive gradients with respect to input and weights
+//   gradB                 - receives gradient with respect to bias, optional (may be nullptr)
+//   weights/gradW are described to cuDNN as [oC, iC, kD, kH, kW]
+//                 NOTE(review): callers are presumably expected to pass them in that layout - confirm
+//   kD,kH,kW    - kernel sizes; sD,sH,sW - strides; pD,pH,pW - paddings; dD,dH,dW - dilations
+//   paddingMode - not used by this helper
+//   isNCDHW     - true: NCDHW layout of input/gradO/gradI, false: NDHWC
+// Throws nd4j::cuda_exception when any cuDNN / CUDA runtime call fails.
+static void conv3dBpCUDNN(const LaunchContext* context,
+                          const NDArray* input, const NDArray* weights, const NDArray* gradO,
+                          NDArray* gradI, NDArray* gradW, NDArray* gradB,
+                          const int kD, const int kH, const int kW,
+                          const int sD, const int sH, const int sW,
+                          const int pD, const int pH, const int pW,
+                          const int dD, const int dH, const int dW,
+                          const int paddingMode, const bool isNCDHW) {
+
+    // owns all cuDNN descriptors and workspaces created below, releasing them on normal return as well as on a throw
+    struct Resources {
+        cudnnTensorDescriptor_t      x       = nullptr;
+        cudnnTensorDescriptor_t      dz      = nullptr;
+        cudnnTensorDescriptor_t      dx      = nullptr;
+        cudnnTensorDescriptor_t      db      = nullptr;
+        cudnnFilterDescriptor_t      dw      = nullptr;
+        cudnnConvolutionDescriptor_t conv    = nullptr;
+        void*                        wsGradW = nullptr;
+        void*                        wsGradI = nullptr;
+
+        ~Resources() {
+            if (db      != nullptr) cudnnDestroyTensorDescriptor(db);
+            if (dx      != nullptr) cudnnDestroyTensorDescriptor(dx);
+            if (dz      != nullptr) cudnnDestroyTensorDescriptor(dz);
+            if (x       != nullptr) cudnnDestroyTensorDescriptor(x);
+            if (dw      != nullptr) cudnnDestroyFilterDescriptor(dw);
+            if (conv    != nullptr) cudnnDestroyConvolutionDescriptor(conv);
+            if (wsGradW != nullptr) cudaFree(wsGradW);
+            if (wsGradI != nullptr) cudaFree(wsGradI);
+        }
+    } res;
+
+    const int numDims = 5;
+
+    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
+    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: can't set stream for cuDNN", err);
+
+    const std::vector<int> pads        = {pD, pH, pW};
+    const std::vector<int> filtStrides = {sD, sH, sW};
+    const std::vector<int> dilations   = {dD, dH, dW};
+
+    // dimensions are passed to cuDNN in N,C,D,H,W order for both layouts, the layout itself is expressed via format/strides
+    const std::vector<int> xShape  = {bS, iC, iD, iH, iW};
+    const std::vector<int> dzShape = {bS, oC, oD, oH, oW};
+    const std::vector<int> wShape  = {oC, iC, kD, kH, kW};
+    const std::vector<int> dbShape = {1, oC, 1, 1, 1};          // bias gradient has one value per output channel, same dim order as dzShape
+
+    const std::vector<int> xStrides  = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)};
+    const std::vector<int> dxStrides = {(int)gradI->strideAt(0), (int)gradI->strideAt(1), (int)gradI->strideAt(2), (int)gradI->strideAt(3), (int)gradI->strideAt(4)};
+    const std::vector<int> dzStrides = {(int)gradO->strideAt(0), (int)gradO->strideAt(1), (int)gradO->strideAt(2), (int)gradO->strideAt(3), (int)gradO->strideAt(4)};
+
+    cudnnTensorFormat_t format = isNCDHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // input descriptor
+    err = cudnnCreateTensorDescriptor(&res.x);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateTensorDescriptor for input failed", err);
+    if(input->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(res.x, format, cudnnDataType(input->dataType()), numDims, xShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(res.x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
+
+    // gradO descriptor
+    err = cudnnCreateTensorDescriptor(&res.dz);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateTensorDescriptor for gradO failed", err);
+    if(gradO->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(res.dz, format, cudnnDataType(gradO->dataType()), numDims, dzShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(res.dz, cudnnDataType(gradO->dataType()), numDims, dzShape.data(), dzStrides.data());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err);
+
+    // gradI descriptor
+    err = cudnnCreateTensorDescriptor(&res.dx);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateTensorDescriptor for gradI failed", err);
+    if(gradI->ews() == 1)
+        err = cudnnSetTensorNdDescriptorEx(res.dx, format, cudnnDataType(gradI->dataType()), numDims, xShape.data());
+    else
+        err = cudnnSetTensorNdDescriptor(res.dx, cudnnDataType(gradI->dataType()), numDims, xShape.data(), dxStrides.data());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err);
+
+    // gradW descriptor (also used to describe weights in the gradI calculation)
+    err = cudnnCreateFilterDescriptor(&res.dw);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateFilterDescriptor failed", err);
+    err = cudnnSetFilterNdDescriptor(res.dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data());
+    if(err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetFilterNdDescriptor failed", err);
+
+    // description of convolution
+    err = cudnnCreateConvolutionDescriptor(&res.conv);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateConvolutionDescriptor failed", err);
+    err = cudnnSetConvolutionNdDescriptor(res.conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
+
+    // gradW algorithm description
+    cudnnConvolutionBwdFilterAlgo_t algoGradW;
+    err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, res.x, res.dz, res.conv, res.dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
+
+    // gradI algorithm description; queried with the gradI descriptor, the same one used for workspace size and execution below
+    cudnnConvolutionBwdDataAlgo_t algoGradI;
+    err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, res.dw, res.dz, res.conv, res.dx, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
+
+    // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
+    size_t wsGradWSize;
+    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, res.x, res.dz, res.conv, res.dw, algoGradW, &wsGradWSize);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
+    auto cudaErr = cudaMalloc(&res.wsGradW, wsGradWSize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
+
+    // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
+    size_t wsGradISize;
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, res.dw, res.dz, res.conv, res.dx, algoGradI, &wsGradISize);
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
+    cudaErr = cudaMalloc(&res.wsGradI, wsGradISize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    // provide scaling parameters: float scalars for element types of up to 4 bytes, double scalars otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+
+    // run calculation for gradB (if not nullptr)
+    if(gradB != nullptr) {
+
+        err = cudnnCreateTensorDescriptor(&res.db);
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnCreateTensorDescriptor for gradB failed", err);
+        // only the channel dimension of gradB is larger than 1, so one packed descriptor serves both layouts
+        err = cudnnSetTensorNdDescriptorEx(res.db, CUDNN_TENSOR_NCHW, cudnnDataType(gradB->dataType()), numDims, dbShape.data());
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor for gradB failed", err);
+
+        err = cudnnConvolutionBackwardBias(*handle, alpha, res.dz, gradO->getSpecialBuffer(), beta, res.db, gradB->getSpecialBuffer());
+        if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
+    }
+
+    // run calculation for gradW
+    err = cudnnConvolutionBackwardFilter(*handle, alpha, res.x, input->getSpecialBuffer(), res.dz, gradO->getSpecialBuffer(), res.conv, algoGradW, res.wsGradW, wsGradWSize, beta, res.dw, gradW->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
+
+    // run calculation for gradI
+    err = cudnnConvolutionBackwardData(*handle, alpha, res.dw, weights->getSpecialBuffer(), res.dz, gradO->getSpecialBuffer(), res.conv, algoGradI, res.wsGradI, wsGradISize, beta, res.dx, gradI->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardData failed", err);
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("conv3dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    // release workspaces explicitly in order to report a failure; ownership is dropped first so the destructor doesn't free them twice
+    void* wsToFree = res.wsGradW;
+    res.wsGradW = nullptr;
+    cudaErr = cudaFree(wsToFree);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
+    wsToFree = res.wsGradI;
+    res.wsGradI = nullptr;
+    cudaErr = cudaFree(wsToFree);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_IMPL(conv3dnew, ENGINE_CUDA) {
+
+    // cuDNN-backed forward pass of conv3dnew.
+    // Steps: validate ranks/shapes, repack the weights into the [oC, iC, kD, kH, kW] layout
+    // handed to conv3dCUDNN, let checkConv3dCUDNNPadAsymmetric substitute an enlarged input
+    // when SAME padding turns out asymmetric, run conv3dCUDNN and release the temporaries.
+    // Returns Status::OK(); exceptions thrown by conv3dCUDNN are propagated after cleanup.
+
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, iC, oC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
+    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW)
+
+    REQUIRE_TRUE(input->rankOf()   == 5, 0, "CONV3D CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf());
+
+    int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) depth
+    int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) height
+    int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast<int>(weights->sizeAt(2));// filter(kernel) width
+    int sD = INT_ARG(3);                                                        // strides depth
+    int sH = INT_ARG(4);                                                        // strides height
+    int sW = INT_ARG(5);                                                        // strides width
+    int pD = INT_ARG(6);                                                        // paddings depth
+    int pH = INT_ARG(7);                                                        // paddings height
+    int pW = INT_ARG(8);                                                        // paddings width
+    int dD = INT_ARG(9);                                                        // dilations depth
+    int dH = INT_ARG(10);                                                       // dilations height
+    int dW = INT_ARG(11);                                                       // dilations width
+    int paddingMode = INT_ARG(12);                                              // 0-VALID, 1-SAME (2-CAUSAL is rejected below)
+    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+
+    REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
+
+    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
+    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+
+    ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CONV3D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    if (bias)
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+
+    // The repacked weights live on the stack so they are released even if the cuDNN helper throws.
+    NDArray newWeights(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC}
+    newWeights.assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC  --> oC, iC, kD, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = nullptr;  // forward pass has no input gradient, so nothing is allocated for it
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottom paddings
+        checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW);
+
+    try {
+        conv3dCUDNN(block.launchContext(), newInput, &newWeights, bias, output, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW, paddingMode, isNCDHW);
+    }
+    catch (...) {
+        // do not leak the enlarged input copy when the helper fails
+        if(newInput != input)
+            delete newInput;
+        throw;
+    }
+
+    if(newInput != input)
+        delete newInput;
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_CHECK(conv3dnew, ENGINE_CUDA) {
+
+    // The cuDNN helper is usable only when padding is not causal and every
+    // participating array (input, weights, optional bias) is DOUBLE, FLOAT32 or HALF.
+
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, iC, oC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
+
+    const int paddingMode = INT_ARG(12);                                 // 0-VALID, 1-SAME, 2-CAUSAL
+
+    // true when the array's element type is one of the three accepted floating point types
+    const auto hasSupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type == DataType::DOUBLE || type == DataType::FLOAT32 || type == DataType::HALF;
+    };
+
+    if(paddingMode == 2)
+        return false;
+
+    if(!hasSupportedType(input) || !hasSupportedType(weights))
+        return false;
+
+    // bias is optional; when present it must satisfy the same type restriction
+    return bias == nullptr || hasSupportedType(bias);
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_IMPL(conv3dnew_bp, ENGINE_CUDA) {
+
+    // cuDNN-backed backward pass of conv3dnew: fills gradI, gradW and (when bias is supplied) gradB.
+    // Steps: validate ranks/shapes, repack weights into [oC, iC, kD, kH, kW], let
+    // checkConv3dCUDNNPadAsymmetric substitute enlarged input/gradI buffers when SAME padding is
+    // asymmetric, run conv3dBpCUDNN, permute the weight gradient back to [kD, kH, kW, iC, oC] and
+    // copy the leading region of the enlarged gradI into the real output.
+    // Returns Status::OK(); exceptions thrown by the helper are propagated after cleanup.
+
+    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
+    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, iC, oC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next
+
+    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kD, kH, kW, iC, oC] always
+    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+
+    REQUIRE_TRUE(input->rankOf()   == 5, 0, "CONV3D_BP CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf());
+    REQUIRE_TRUE(gradO->rankOf() == 5, 0,   "CONV3D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 5, but got %i instead !", gradO->rankOf());
+
+    int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) depth
+    int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) height
+    int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast<int>(weights->sizeAt(2));// filter(kernel) width
+    int sD = INT_ARG(3);                                                        // strides depth
+    int sH = INT_ARG(4);                                                        // strides height
+    int sW = INT_ARG(5);                                                        // strides width
+    int pD = INT_ARG(6);                                                        // paddings depth
+    int pH = INT_ARG(7);                                                        // paddings height
+    int pW = INT_ARG(8);                                                        // paddings width
+    int dD = INT_ARG(9);                                                        // dilations depth
+    int dH = INT_ARG(10);                                                       // dilations height
+    int dW = INT_ARG(11);                                                       // dilations width
+    int paddingMode = INT_ARG(12);                                              // 0-VALID, 1-SAME (2-CAUSAL is rejected below)
+    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+
+    // reject the unsupported mode before it is used in any size computation
+    REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D_BP CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
+
+    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
+    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+
+    int trueoD, trueoH, trueoW;          // true output depth/height/width
+    ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);
+
+    std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
+    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CONV3D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
+    // weights and gradW are validated separately, each message reporting the array that was actually checked
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CONV3D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    REQUIRE_TRUE(gradW->isSameShape(expectedWeightsShape), 0, "CONV3D_BP CUDNN OP: wrong shape of weights gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(gradW).c_str());
+    if(bias)
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+
+    ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode);
+
+    // Both repacked arrays live on the stack so they are released even if the cuDNN helper throws.
+    NDArray newGradW(gradW->ordering(),     {oC, iC, kD, kH, kW}, gradW->dataType(),   gradW->getContext()); // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC}
+    NDArray newWeights(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext());
+
+    newWeights.assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC  --> oC, iC, kD, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = gradI;
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottom paddings
+        checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW);
+
+    // newInput/newGradI are replaced together by the helper above, so one flag covers both
+    const bool hasPaddedCopies = newInput != input;
+
+    try {
+        conv3dBpCUDNN(block.launchContext(), newInput, &newWeights, gradO,   newGradI, &newGradW, gradB, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW,paddingMode,isNCDHW);
+
+        newGradW.permutei({2,3,4,1,0});    // [oC, iC, kD, kH, kW] -> [kD, kH, kW, iC, oC]
+        gradW->assign(&newGradW);
+
+        if(hasPaddedCopies) {
+            // keep only the leading region that matches the real gradI extents
+            if(isNCDHW)
+                gradI->assign((*newGradI)({0,0,  0,0,  0,gradI->sizeAt(2),  0,gradI->sizeAt(3),  0,gradI->sizeAt(4)}));
+            else
+                gradI->assign((*newGradI)({0,0,  0,gradI->sizeAt(1),  0,gradI->sizeAt(2),  0,gradI->sizeAt(3),  0,0}));
+        }
+    }
+    catch (...) {
+        // do not leak the enlarged copies when the helper or the copy-back fails
+        if(hasPaddedCopies) {
+            delete newInput;
+            delete newGradI;
+        }
+        throw;
+    }
+
+    if(hasPaddedCopies) {
+        delete newInput;
+        delete newGradI;
+    }
+
+    return Status::OK();
+}
+
+PLATFORM_CHECK(conv3dnew_bp, ENGINE_CUDA) {
+
+    // The cuDNN backward helper is usable only for the NCDHW layout, non-causal padding,
+    // and when input, weights, gradO and the optional bias are DOUBLE, FLOAT32 or HALF.
+
+    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
+    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, iC, oC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next
+
+    const int paddingMode = INT_ARG(12);                                             // 1-SAME,  0-VALID, 2-CAUSAL
+    const int isNCDHW     = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;   // INT_ARG(13): 1-NDHWC, 0-NCDHW
+
+    // true when the array's element type is one of the three accepted floating point types
+    const auto hasSupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type == DataType::DOUBLE || type == DataType::FLOAT32 || type == DataType::HALF;
+    };
+
+    if(!isNCDHW || paddingMode == 2)
+        return false;
+
+    if(!hasSupportedType(input) || !hasSupportedType(weights) || !hasSupportedType(gradO))
+        return false;
+
+    // bias is optional; when present it must satisfy the same type restriction
+    return bias == nullptr || hasSupportedType(bias);
+}
+
+}
+}
+}
--- a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h
+++ b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h
@ -0,0 +1,158 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef SD_CUDNNUTILS_H
+#define SD_CUDNNUTILS_H
+
+#include <ops/declarable/PlatformHelper.h>
+#include <ops/declarable/OpRegistrator.h>
+#include <platform_boilerplate.h>
+#include <exceptions/cuda_exception.h>
+#include <exceptions/datatype_exception.h>
+#include <dll.h>
+
+#include <cudnn.h>
+
+namespace nd4j {
+namespace ops {
+namespace platforms {
+
+    // Platform helper declarations for the CUDA engine. Each forward op is listed
+    // together with its backprop counterpart; the matching PLATFORM_IMPL / PLATFORM_CHECK
+    // definitions are expected in the .cu files of this directory.
+
+    // 2D convolution
+    DECLARE_PLATFORM(conv2d, ENGINE_CUDA);
+    DECLARE_PLATFORM(conv2d_bp, ENGINE_CUDA);
+
+    // 3D convolution
+    DECLARE_PLATFORM(conv3dnew, ENGINE_CUDA);
+    DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CUDA);
+
+    // depthwise 2D convolution
+    DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CUDA);
+    DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CUDA);
+
+    // batch normalization
+    DECLARE_PLATFORM(batchnorm, ENGINE_CUDA);
+    DECLARE_PLATFORM(batchnorm_bp, ENGINE_CUDA);
+
+//////////////////////////////////////////////////////////////////////////
+FORCEINLINE cudnnDataType_t cudnnDataType(nd4j::DataType dataType) {
+    // Translate an nd4j element type into the corresponding cuDNN enumerator.
+    // Only FLOAT32, DOUBLE, HALF, INT32 and INT8 are mapped; anything else
+    // is reported through datatype_exception.
+    if (dataType == nd4j::DataType::FLOAT32)
+        return CUDNN_DATA_FLOAT;
+
+    if (dataType == nd4j::DataType::DOUBLE)
+        return CUDNN_DATA_DOUBLE;
+
+    if (dataType == nd4j::DataType::HALF)
+        return CUDNN_DATA_HALF;
+
+    if (dataType == nd4j::DataType::INT32)
+        return CUDNN_DATA_INT32;
+
+    if (dataType == nd4j::DataType::INT8)
+        return CUDNN_DATA_INT8;
+
+    throw datatype_exception::build("Unsupported data type", dataType);
+}
+
+//////////////////////////////////////////////////////////////////////////
+FORCEINLINE void checkConv2dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI,
+                                            const int iH, const int iW,
+                                            const int oH, const int oW,
+                                            const int kH, const int kW,
+                                            const int sH, const int sW,
+                                            const int pH, const int pW,
+                                            const int dH, const int dW,
+                                            const bool isNCHW) {
+
+    // Detects asymmetric padding along height/width (total padding differs from 2 * leading padding).
+    // When found, `input` is re-pointed to a freshly allocated array that is one element longer
+    // along each affected axis and holds the original data in its leading region; a non-null
+    // `gradI` is re-pointed to a freshly allocated array of the same enlarged shape.
+    // The caller owns (and must delete) every array allocated here. Nothing changes when
+    // padding is symmetric along both axes.
+
+    // total padding required along each spatial axis
+    const int totalPadH = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH);
+    const int totalPadW = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW);
+
+    // trailing padding (total - leading) differs from the leading one -> asymmetric
+    const bool growH = (totalPadH - pH) != pH;
+    const bool growW = (totalPadW - pW) != pW;
+
+    if(!(growH || growW))
+        return;
+
+    const int hAxis = isNCHW ? 2 : 1;
+    const int wAxis = hAxis + 1;
+
+    std::vector<Nd4jLong> enlargedShape = input->getShapeAsVector();
+    enlargedShape[hAxis] += growH ? 1 : 0;
+    enlargedShape[wAxis] += growW ? 1 : 0;
+
+    NDArray* enlargedInput = new NDArray(input->ordering(), enlargedShape, input->dataType(), input->getContext());
+
+    // copy the original data into the leading [0, size) window of both spatial axes
+    const Nd4jLong origH = input->sizeAt(hAxis);
+    const Nd4jLong origW = input->sizeAt(wAxis);
+
+    if(isNCHW)
+        (*enlargedInput)({0,0,  0,0,  0,origH,  0,origW}).assign(input);
+    else
+        (*enlargedInput)({0,0,  0,origH,  0,origW,  0,0}).assign(input);
+
+    input = enlargedInput;
+
+    if(gradI == nullptr)
+        return;
+
+    gradI = new NDArray(gradI->ordering(), enlargedShape, gradI->dataType(), gradI->getContext());
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+FORCEINLINE void checkConv3dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI,
+                                            const int iD, const int iH, const int iW,
+                                            const int oD, const int oH, const int oW,
+                                            const int kD, const int kH, const int kW,
+                                            const int sD, const int sH, const int sW,
+                                            const int pD, const int pH, const int pW,
+                                            const int dD, const int dH, const int dW,
+                                            const bool isNCDHW) {
+
+    // Detects asymmetric padding along depth/height/width (total padding differs from 2 * leading padding).
+    // When found, `input` is re-pointed to a freshly allocated array that is one element longer
+    // along each affected axis and holds the original data in its leading region; a non-null
+    // `gradI` is re-pointed to a freshly allocated array of the same enlarged shape.
+    // The caller owns (and must delete) every array allocated here. Nothing changes when
+    // padding is symmetric along all three axes.
+
+    // total padding required along each spatial axis
+    const int totalPadD = ((oD - 1) * sD + ((kD - 1) * dD + 1) - iD);
+    const int totalPadH = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH);
+    const int totalPadW = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW);
+
+    // trailing padding (total - leading) differs from the leading one -> asymmetric
+    const bool growD = (totalPadD - pD) != pD;
+    const bool growH = (totalPadH - pH) != pH;
+    const bool growW = (totalPadW - pW) != pW;
+
+    if(!(growD || growH || growW))
+        return;
+
+    const int dAxis = isNCDHW ? 2 : 1;
+    const int hAxis = dAxis + 1;
+    const int wAxis = dAxis + 2;
+
+    std::vector<Nd4jLong> enlargedShape = input->getShapeAsVector();
+    enlargedShape[dAxis] += growD ? 1 : 0;
+    enlargedShape[hAxis] += growH ? 1 : 0;
+    enlargedShape[wAxis] += growW ? 1 : 0;
+
+    NDArray* enlargedInput = new NDArray(input->ordering(), enlargedShape, input->dataType(), input->getContext());
+
+    // copy the original data into the leading [0, size) window of all three spatial axes
+    const Nd4jLong origD = input->sizeAt(dAxis);
+    const Nd4jLong origH = input->sizeAt(hAxis);
+    const Nd4jLong origW = input->sizeAt(wAxis);
+
+    if(isNCDHW)
+        (*enlargedInput)({0,0,  0,0,  0,origD,  0,origH,  0,origW}).assign(input);
+    else
+        (*enlargedInput)({0,0,  0,origD,  0,origH,  0,origW,  0,0}).assign(input);
+
+    input = enlargedInput;
+
+    if(gradI == nullptr)
+        return;
+
+    gradI = new NDArray(gradI->ordering(), enlargedShape, gradI->dataType(), gradI->getContext());
+}
+
+}
+}
+}
+
+#endif //SD_CUDNNUTILS_H
--- a/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu
+++ b/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu
@ -0,0 +1,443 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include "cudnnUtils.h"
+#include <ops/declarable/helpers/convolutions.h>
+
+namespace nd4j      {
+namespace ops       {
+namespace platforms {
+
+
+//////////////////////////////////////////////////////////////////////////
+static void depthwiseConv2dCUDNN(const LaunchContext* context,
+                        const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
+                        const int kH, const int kW,
+                        const int sH, const int sW,
+                        const int pH, const int pW,
+                        const int dH, const int dW,
+                        const int paddingMode, const bool isNCHW) {
+
+    // Forward depthwise 2D convolution executed through cuDNN grouped convolution.
+    //
+    // cudnn supports only following case: mC = 1, oC = iC (groupCount == iC)
+    //
+    // input   [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc
+    // weights [iC, mC, kH, kW], i.e. already in the layout described to cuDNN by the filter descriptor below
+    // bias    [oC], may be nullptr
+    // output  [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc
+    // oC = iC*mC
+    //
+    // paddingMode is accepted for signature symmetry and is not read here; pH/pW are used as given.
+    // Throws nd4j::cuda_exception when any cuDNN / CUDA runtime call reports an error.
+
+    int bS, iC, iH, iW, mC, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;           // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(1);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: can't set stream for cuDNN", err);
+
+    cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // input descriptor
+    cudnnTensorDescriptor_t x;
+    cudnnCreateTensorDescriptor(&x);
+    if(input->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
+
+    // weights descriptor
+    cudnnFilterDescriptor_t w;
+    cudnnCreateFilterDescriptor(&w);
+    err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW);
+    if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetFilter4dDescriptor failed", err);
+
+    // output descriptor
+    cudnnTensorDescriptor_t z;
+    cudnnCreateTensorDescriptor(&z);
+    if(output->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err);
+
+    // description of convolution
+    cudnnConvolutionDescriptor_t conv;
+    cudnnCreateConvolutionDescriptor(&conv);
+    err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err);
+    err = cudnnSetConvolutionGroupCount(conv, iC);  // set number of groups (depthwise mode) in description of convolution, groupCount == iC
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolutionGroupCount failed", err);
+
+    // algorithm description
+    cudnnConvolutionFwdAlgo_t algo;
+    err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
+
+    // allocate auxiliary device memory, abbreviation ws means workspace
+    size_t wsSize;
+    err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
+    void* wsData;
+    auto cudaErr = cudaMalloc(&wsData, wsSize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
+
+    // provide scaling parameters: cuDNN takes float scalars for element sizes up to 4 bytes and double otherwise
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({output}, {input, weights, bias});
+
+    // run calculation
+    err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer());
+    if (err != 0) {
+        cudaFree(wsData);   // best effort: do not leak the workspace on failure
+        throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnConvolutionForward failed", err);
+    }
+
+    // add bias if it is present
+    if (bias != nullptr) {
+
+        // cudnnSetTensor4dDescriptor always takes logical (n, c, h, w) extents regardless of the memory
+        // format, so a per-channel bias is described as 1 x oC x 1 x 1 for both nchw and nhwc outputs;
+        // with three unit extents the memory layout is the same in either format
+        cudnnTensorDescriptor_t b;
+        cudnnCreateTensorDescriptor(&b);
+        err = cudnnSetTensor4dDescriptor(b, CUDNN_TENSOR_NCHW, cudnnDataType(bias->dataType()), 1, static_cast<int>(bias->lengthOf()), 1, 1);
+        if (err == 0) {
+            // output = 1 * bias + 1 * output (alpha is reused as the second scaling factor)
+            err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer());
+            cudnnDestroyTensorDescriptor(b);
+            if (err != 0) {
+                cudaFree(wsData);
+                throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnAddTensor bias failed", err);
+            }
+        }
+        else {
+            cudnnDestroyTensorDescriptor(b);
+            cudaFree(wsData);
+            throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err);
+        }
+    }
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("depthwiseConv2dCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    cudaErr = cudaFree(wsData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
+
+    // release cuDNN descriptors created above, otherwise they leak on every call
+    cudnnDestroyConvolutionDescriptor(conv);
+    cudnnDestroyTensorDescriptor(z);
+    cudnnDestroyFilterDescriptor(w);
+    cudnnDestroyTensorDescriptor(x);
+
+    NDArray::registerSpecialUse({output}, {input, weights, bias});
+}
+
+//////////////////////////////////////////////////////////////////////////
+static void depthwiseConv2dBpCUDNN(const LaunchContext* context,
+                                const NDArray* input, const NDArray* weights, const NDArray* gradO,
+                                NDArray* gradI, NDArray* gradW, NDArray* gradB,
+                                const int kH, const int kW,
+                                const int sH, const int sW,
+                                const int pH, const int pW,
+                                const int dH, const int dW,
+                                const int paddingMode, const bool isNCHW) {
+
+    // cudnn supports only following case: mC = 1, oC = iC (groupCount == iC)
+
+    // input, gradI [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc
+    // weights, gradW [iC, mC, kH, kW], mkl doesn't support this format, so we'll make permute
+    // gradB [oC], may be nullptr
+    // gradO [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc
+    // oC = iC*mC
+
+    int bS, iC, iH, iW, mC, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;           // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(1);
+
+    auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
+    cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: can't set stream for cuDNN", err);
+
+    cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
+
+    // input descriptor
+    cudnnTensorDescriptor_t x;
+    cudnnCreateTensorDescriptor(&x);
+    if(input->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
+
+    // gradO descriptor
+    cudnnTensorDescriptor_t dz;
+    cudnnCreateTensorDescriptor(&dz);
+    if(gradO->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err);
+
+    // gradI descriptor
+    cudnnTensorDescriptor_t dx;
+    cudnnCreateTensorDescriptor(&dx);
+    if(gradI->ews() == 1)
+        err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW);
+    else
+        err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err);
+
+    // gradW descriptor
+    cudnnFilterDescriptor_t dw;
+    cudnnCreateFilterDescriptor(&dw);
+    err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW);
+    if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err);
+
+    // description of convolution
+    cudnnConvolutionDescriptor_t conv;
+    cudnnCreateConvolutionDescriptor(&conv);
+    err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err);
+    err = cudnnSetConvolutionGroupCount(conv, iC);  // set number of groups (depthwise mode) in description of convolution, groupCount == iC
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolutionGroupCount failed", err);
+
+    // gradW algorithm description
+    cudnnConvolutionBwdFilterAlgo_t algoGradW;
+    err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
+
+    // gradI algorithm description
+    cudnnConvolutionBwdDataAlgo_t algoGradI;
+    err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
+
+    // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
+    size_t wsGradWSize;
+    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
+    void* wsGradWData;
+    auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
+
+    // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
+    size_t wsGradISize;
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
+    void* wsGradIData;
+    cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    // provide scaling parameters
+    const float  alpha32(1), beta32(0);
+    const double alpha64(1), beta64(0);
+    const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
+    const void* beta  = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32)  : reinterpret_cast<const void*>(&beta64);
+
+    NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+
+    // run calculation for gradB (if not nullptr)
+    if(gradB != nullptr) {
+        cudnnTensorDescriptor_t db;
+        cudnnCreateTensorDescriptor(&db);
+        err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf());
+        if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err);
+
+        err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
+        if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
+    }
+
+    // run calculation for gradW
+    err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
+
+    // run calculation for gradI
+    err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
+    if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardData failed", err);
+
+    // cudaErr = cudaStreamSynchronize(*context->getCudaStream());
+    // if (cudaErr != 0)
+    //     throw cuda_exception::build("depthwiseConv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
+
+    cudaErr = cudaFree(wsGradWData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
+    cudaErr = cudaFree(wsGradIData);
+    if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
+
+    NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_IMPL(depthwise_conv2d, ENGINE_CUDA) {
+    // cuDNN-backed forward pass of depthwise conv2d: validates arguments, repacks the weights and delegates to depthwiseConv2dCUDNN
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, mC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC] = iC*mC
+    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
+
+    REQUIRE_TRUE(input->rankOf()   == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
+
+    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) height
+    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) width
+    int sH = INT_ARG(2);                                                        // strides height
+    int sW = INT_ARG(3);                                                        // strides width
+    int pH = INT_ARG(4);                                                        // paddings height
+    int pW = INT_ARG(5);                                                        // paddings width
+    int dH = INT_ARG(6);                                                        // dilations height
+    int dW = INT_ARG(7);                                                        // dilations width
+    int paddingMode = INT_ARG(8);                                               // 0-VALID, 1-SAME
+    int isNCHW      = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;      // INT_ARG(9): 0-NCHW,  1-NHWC
+
+    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(indWmC);                           // channels multiplier
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    REQUIRE_TRUE(output->sizeAt(indIOioC) == iC*mC, 0, "DEPTHWISECONV2D CUDNN OP: the output_channels must be equal to input_channels * channels_multiplier = %i !", iC*mC);
+    if (bias)
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+
+    NDArray newWeights(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext()); // automatic storage: released even if the helper throws; cudnn support format {oC, iC/groupCount, kH, kW}
+    newWeights.assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC  --> iC, mC, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = nullptr;
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottom paddings
+        checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
+
+    try {
+        depthwiseConv2dCUDNN(block.launchContext(), newInput, &newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW);
+    } catch (...) {
+        if(newInput != input) delete newInput;  // helper threw: do not leak the input copy substituted by checkConv2dCUDNNPadAsymmetric
+        throw;
+    }
+    if(newInput != input)
+        delete newInput;
+
+    return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_CHECK(depthwise_conv2d, ENGINE_CUDA) {
+    // cuDNN path is selected only for channel multiplier 1, non-causal padding and DOUBLE/FLOAT32/HALF arrays
+    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, mC] always
+    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC] = iC*mC
+
+    const int paddingMode = INT_ARG(8);                                  // 0-VALID, 1-SAME, 2-CAUSAL
+    const int mC          = weights->sizeAt(3);                          // channels multiplier
+
+    const auto hasSupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type == DataType::DOUBLE || type == DataType::FLOAT32 || type == DataType::HALF;
+    };
+
+    return mC == 1 && paddingMode != 2 && hasSupportedType(input) && hasSupportedType(weights) && (bias == nullptr || hasSupportedType(bias));
+}
+
+//////////////////////////////////////////////////////////////////////////
+PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CUDA) {
+    // cuDNN-backed backward pass of depthwise conv2d: validates arguments, repacks weights, delegates to depthwiseConv2dBpCUDNN and copies gradients back
+    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC] = [iC*mC]
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, iC, mC] always
+    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+
+    REQUIRE_TRUE(input->rankOf()   == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
+    REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
+    REQUIRE_TRUE(gradO->rankOf() == 4, 0,   "DEPTHWISECONV2D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf());
+
+    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) height
+    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) width
+    int sH = INT_ARG(2);                                                        // strides height
+    int sW = INT_ARG(3);                                                        // strides width
+    int pH = INT_ARG(4);                                                        // paddings height
+    int pW = INT_ARG(5);                                                        // paddings width
+    int dH = INT_ARG(6);                                                        // dilations height
+    int dW = INT_ARG(7);                                                        // dilations width
+    int paddingMode = INT_ARG(8);                                               // 0-VALID, 1-SAME
+    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+
+    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(indWmC);                           // channels multiplier
+
+    int trueoH, trueoW;          // correct output height, width
+    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
+    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
+    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
+    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
+    if(bias)
+        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
+
+    // temporaries live in automatic storage, so they are released even when the cuDNN helper throws
+    NDArray newGradW(gradW->ordering(),     {iC, mC, kH, kW}, gradW->dataType(),   gradW->getContext());     // cudnn support format {oC, iC/groupCount, kH, kW}
+    NDArray newWeights(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext());
+
+    newWeights.assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC  --> iC, mC, kH, kW)
+
+    NDArray* newInput = input;
+    NDArray* newGradI = gradI;
+    if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottom paddings
+        checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
+
+    try {
+        depthwiseConv2dBpCUDNN(block.launchContext(), newInput, &newWeights, gradO,   newGradI, &newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW);
+    } catch (...) {
+        if(newInput != input) { delete newInput; delete newGradI; }  // helper threw: do not leak the copies substituted by checkConv2dCUDNNPadAsymmetric
+        throw;
+    }
+
+    newGradW.permutei({2,3,0,1});  // [iC, mC, kH, kW] -> [kH, kW, iC, mC]
+    gradW->assign(newGradW);
+
+    if(newInput != input) {
+        if(isNCHW)
+            gradI->assign((*newGradI)({0,0,  0,0,  0,gradI->sizeAt(2),  0,gradI->sizeAt(3)}));
+        else
+            gradI->assign((*newGradI)({0,0,  0,gradI->sizeAt(1),  0,gradI->sizeAt(2),  0,0}));
+        delete newInput;
+        delete newGradI;
+    }
+
+    return Status::OK();
+}
+
+PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CUDA) {
+    // cuDNN path is selected only for NCHW layout, channel multiplier 1, non-causal padding and DOUBLE/FLOAT32/HALF arrays
+    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
+    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC] = [iC*mC]
+    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+    const int paddingMode = INT_ARG(8);                                             // 0-VALID, 1-SAME, 2-CAUSAL
+    const int isNCHW      = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;    // INT_ARG(9): 0-NCHW, 1-NHWC
+    const int mC          = weights->sizeAt(3);                                     // channels multiplier
+
+    // positive form of the per-array data type test
+    const auto hasSupportedType = [](const NDArray* array) {
+        const auto type = array->dataType();
+        return type == DataType::DOUBLE || type == DataType::FLOAT32 || type == DataType::HALF;
+    };
+
+    return mC == 1 && isNCHW && paddingMode != 2 && hasSupportedType(input) && hasSupportedType(weights) && hasSupportedType(gradO) && (bias == nullptr || hasSupportedType(bias));
+}
+
+
+}
+}
+}
--- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp
@ -28,11 +28,12 @@
 #include <ops/declarable/helpers/convolutions.h>

 using namespace dnnl;
+using namespace samediff;

 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(avgpool2d) {
+            PLATFORM_IMPL(avgpool2d, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);

                REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead",
@ -128,7 +129,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(avgpool2d) {
+            PLATFORM_CHECK(avgpool2d, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp
@ -32,7 +32,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(avgpool2d_bp) {
+            PLATFORM_IMPL(avgpool2d_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                          // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
                auto gradO = INPUT_VARIABLE(
@ -138,7 +138,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(avgpool2d_bp) {
+            PLATFORM_CHECK(avgpool2d_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp
@ -32,7 +32,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(avgpool3dnew) {
+            PLATFORM_IMPL(avgpool3dnew, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
                auto output = OUTPUT_VARIABLE(
@ -130,7 +130,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(avgpool3dnew) {
+            PLATFORM_CHECK(avgpool3dnew, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp
@ -31,7 +31,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(avgpool3dnew_bp) {
+            PLATFORM_IMPL(avgpool3dnew_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                          // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
                auto gradO = INPUT_VARIABLE(
@ -143,7 +143,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(avgpool3dnew_bp) {
+            PLATFORM_CHECK(avgpool3dnew_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp
@ -375,7 +375,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
    *dLdI += xMinusMean;
 }

-PLATFORM_IMPL(batchnorm) {
+PLATFORM_IMPL(batchnorm, ENGINE_CPU) {

    auto input    = INPUT_VARIABLE(0);  // 2D:nc, 4D:nchw, 5D:ncdhw
    auto mean     = INPUT_VARIABLE(1);  // [c]
@ -455,7 +455,7 @@ PLATFORM_IMPL(batchnorm) {
 }

 //////////////////////////////////////////////////////////////////////////
-PLATFORM_CHECK(batchnorm) {
+PLATFORM_CHECK(batchnorm, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    // if (::optimalLevel() < 2)
    //     return false;
@ -632,7 +632,7 @@ PLATFORM_CHECK(batchnorm) {


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(batchnorm_bp) {
+PLATFORM_IMPL(batchnorm_bp, ENGINE_CPU) {

    NDArray* input    = INPUT_VARIABLE(0);                  // 2D:nc, 4D:nchw, 5D:ncdhw
    NDArray* mean     = INPUT_VARIABLE(1);                  // [c]
@ -735,7 +735,7 @@ PLATFORM_IMPL(batchnorm_bp) {
 }

 //////////////////////////////////////////////////////////////////////////
-PLATFORM_CHECK(batchnorm_bp) {
+PLATFORM_CHECK(batchnorm_bp, ENGINE_CPU) {
    NDArray* input    = INPUT_VARIABLE(0);      // 2D:nc, 4D:nchw, 5D:ncdhw
    NDArray* mean     = INPUT_VARIABLE(1);      // [c]
    NDArray* variance = INPUT_VARIABLE(2);      // [c]
--- a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp
@ -113,7 +113,7 @@ static void conv2d_mkldnn(nd4j::graph::Context &block, const NDArray *input, con
 }

 //////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(conv2d) {
+PLATFORM_IMPL(conv2d, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC] always
    auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
@ -137,7 +137,7 @@ PLATFORM_IMPL(conv2d) {
    return Status::OK();
 }

-PLATFORM_CHECK(conv2d) {
+PLATFORM_CHECK(conv2d, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    if (::optimalLevel() < 2)
        return false;
@ -151,7 +151,7 @@ PLATFORM_CHECK(conv2d) {
 }

 //////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(conv2d_bp) {
+PLATFORM_IMPL(conv2d_bp, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
    auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
@ -328,7 +328,7 @@ PLATFORM_IMPL(conv2d_bp) {
    return Status::OK();
 }

-PLATFORM_CHECK(conv2d_bp) {
+PLATFORM_CHECK(conv2d_bp, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
    auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
--- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp
@ -34,7 +34,7 @@ namespace ops       {
 namespace platforms {

 //////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(conv3dnew) {
+PLATFORM_IMPL(conv3dnew, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(
            0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, iC, oC] always
@ -150,7 +150,7 @@ PLATFORM_IMPL(conv3dnew) {
    return Status::OK();
 }

-PLATFORM_CHECK(conv3dnew) {
+PLATFORM_CHECK(conv3dnew, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    if (::optimalLevel() < 2)
        return false;
@ -167,7 +167,7 @@ PLATFORM_CHECK(conv3dnew) {


 //////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(conv3dnew_bp) {
+PLATFORM_IMPL(conv3dnew_bp, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(
            0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(
@ -374,7 +374,7 @@ PLATFORM_IMPL(conv3dnew_bp) {
    return Status::OK();
 }

-PLATFORM_CHECK(conv3dnew_bp) {
+PLATFORM_CHECK(conv3dnew_bp, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(
            0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(
--- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp
@ -349,7 +349,7 @@ static void deconv2dBackPropMKLDNN(const NDArray* input, const NDArray* weights,


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(deconv2d) {
+PLATFORM_IMPL(deconv2d, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, oC, iC] always
@ -406,7 +406,7 @@ PLATFORM_IMPL(deconv2d) {
    return Status::OK();
 }

-PLATFORM_CHECK(deconv2d) {
+PLATFORM_CHECK(deconv2d, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    // if (::optimalLevel() < 2)
    //     return false;
@ -435,7 +435,7 @@ PLATFORM_CHECK(deconv2d) {


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(deconv2d_bp) {
+PLATFORM_IMPL(deconv2d_bp, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, oC, iC] always
@ -506,7 +506,7 @@ PLATFORM_IMPL(deconv2d_bp) {
    return Status::OK();
 }

-PLATFORM_CHECK(deconv2d_bp) {
+PLATFORM_CHECK(deconv2d_bp, ENGINE_CPU) {
    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, oC, iC] always
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
--- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp
@ -145,7 +145,7 @@ static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* grad


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(deconv2d_tf) {
+PLATFORM_IMPL(deconv2d_tf, ENGINE_CPU) {

    auto gradO      = INPUT_VARIABLE(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
    auto weights    = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
@ -222,7 +222,7 @@ PLATFORM_IMPL(deconv2d_tf) {
    return Status::OK();
 }

-PLATFORM_CHECK(deconv2d_tf) {
+PLATFORM_CHECK(deconv2d_tf, ENGINE_CPU) {
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
    auto gradO   = INPUT_VARIABLE(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next
    auto gradI   = OUTPUT_VARIABLE(0);                                               // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW), gradI
--- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp
@ -360,7 +360,7 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(deconv3d) {
+PLATFORM_IMPL(deconv3d, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, oC, iC] always
@ -421,7 +421,7 @@ PLATFORM_IMPL(deconv3d) {
    return Status::OK();
 }

-PLATFORM_CHECK(deconv3d) {
+PLATFORM_CHECK(deconv3d, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    // if (::optimalLevel() < 2)
    //     return false;
@ -451,7 +451,7 @@ PLATFORM_CHECK(deconv3d) {


 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(deconv3d_bp) {
+PLATFORM_IMPL(deconv3d_bp, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, oC, iC] always
@ -525,7 +525,7 @@ PLATFORM_IMPL(deconv3d_bp) {
 }


-PLATFORM_CHECK(deconv3d_bp) {
+PLATFORM_CHECK(deconv3d_bp, ENGINE_CPU) {
    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NHWC) or [bS, iD, iC, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, oC, iC] always
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
--- a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp
@ -362,7 +362,7 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w


 //////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(depthwise_conv2d) {
+PLATFORM_IMPL(depthwise_conv2d, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, mC] always
@ -400,7 +400,7 @@ PLATFORM_IMPL(depthwise_conv2d) {
 }

 //////////////////////////////////////////////////////////////////////
-PLATFORM_CHECK(depthwise_conv2d) {
+PLATFORM_CHECK(depthwise_conv2d, ENGINE_CPU) {
    // we don't want to use mkldnn if cpu doesn't support avx/avx2
    if (::optimalLevel() < 2)
        return false;
@ -427,7 +427,7 @@ PLATFORM_CHECK(depthwise_conv2d) {
 }

 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(depthwise_conv2d_bp) {
+PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
@ -476,7 +476,7 @@ PLATFORM_IMPL(depthwise_conv2d_bp) {
 }

 //////////////////////////////////////////////////////////////////////
-PLATFORM_CHECK(depthwise_conv2d_bp) {
+PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CPU) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
--- a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp
@ -32,7 +32,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(lrn) {
+            PLATFORM_IMPL(lrn, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

@ -82,7 +82,7 @@ namespace nd4j {
                return Status::OK();
            };

-            PLATFORM_CHECK(lrn) {
+            PLATFORM_CHECK(lrn, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp
@ -365,7 +365,7 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray*
 }

 //////////////////////////////////////////////////////////////////////////
-PLATFORM_IMPL(lstmLayer) {
+PLATFORM_IMPL(lstmLayer, ENGINE_CPU) {

    const auto dataFormat    = INT_ARG(0);    // for unidirectional: 0 = [sL, bS, nIn], 1 = [bS, sL ,nIn], 2 = [bS, nIn, sL], for bidirectional: 3 = [sL, 2, bS, nOut] (for ONNX)
    const auto directionMode = INT_ARG(1);    // direction: 0 = fwd, 1 = bwd, 2 = bidirectional sum, 3 = bidirectional concat, 4 = bidirectional extra output dim (in conjunction with format dataFormat = 3)
@ -493,7 +493,7 @@ PLATFORM_IMPL(lstmLayer) {
    return Status::OK();
 }

-PLATFORM_CHECK(lstmLayer) {
+PLATFORM_CHECK(lstmLayer, ENGINE_CPU) {
    const auto hasBiases  = B_ARG(0);   // indicates whether biases array is provided
    const auto hasInitH   = B_ARG(2);   // indicates whether initial output is provided
    const auto hasInitC   = B_ARG(3);   // indicates whether initial cell state is provided
--- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp
@ -32,7 +32,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(maxpool2d) {
+            PLATFORM_IMPL(maxpool2d, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);

                REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead",
@ -134,7 +134,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(maxpool2d) {
+            PLATFORM_CHECK(maxpool2d, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp
@ -32,7 +32,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(maxpool2d_bp) {
+            PLATFORM_IMPL(maxpool2d_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                          // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
                auto gradO = INPUT_VARIABLE(
@ -163,7 +163,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(maxpool2d_bp) {
+            PLATFORM_CHECK(maxpool2d_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp
@ -31,7 +31,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(maxpool3dnew) {
+            PLATFORM_IMPL(maxpool3dnew, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
                auto output = OUTPUT_VARIABLE(
@ -140,7 +140,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(maxpool3dnew) {
+            PLATFORM_CHECK(maxpool3dnew, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp
@ -31,7 +31,7 @@ using namespace dnnl;
 namespace nd4j {
    namespace ops {
        namespace platforms {
-            PLATFORM_IMPL(maxpool3dnew_bp) {
+            PLATFORM_IMPL(maxpool3dnew_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(
                        0);                          // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
                auto gradO = INPUT_VARIABLE(
@ -170,7 +170,7 @@ namespace nd4j {
                return Status::OK();
            }

-            PLATFORM_CHECK(maxpool3dnew_bp) {
+            PLATFORM_CHECK(maxpool3dnew_bp, ENGINE_CPU) {
                auto input = INPUT_VARIABLE(0);
                auto output = OUTPUT_VARIABLE(0);

--- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h
@ -29,6 +29,8 @@
 #include <ops/declarable/PlatformHelper.h>
 #include <platform_boilerplate.h>

+using namespace samediff;
+

 namespace nd4j{
    namespace ops {
@ -36,50 +38,51 @@ namespace nd4j{
            /**
             * Here we actually declare our platform helpers
             */
-            DECLARE_PLATFORM(conv2d);
+            DECLARE_PLATFORM(conv2d, ENGINE_CPU);

-            DECLARE_PLATFORM(conv2d_bp);
+            DECLARE_PLATFORM(conv2d_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(avgpool2d);
+            DECLARE_PLATFORM(avgpool2d, ENGINE_CPU);

-            DECLARE_PLATFORM(avgpool2d_bp);
+            DECLARE_PLATFORM(avgpool2d_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(maxpool2d);
+            DECLARE_PLATFORM(maxpool2d, ENGINE_CPU);

-            DECLARE_PLATFORM(maxpool2d_bp);
+            DECLARE_PLATFORM(maxpool2d_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(conv3dnew);
+            DECLARE_PLATFORM(conv3dnew, ENGINE_CPU);

-            DECLARE_PLATFORM(conv3dnew_bp);
+            DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(maxpool3dnew);
+            DECLARE_PLATFORM(maxpool3dnew, ENGINE_CPU);

-            DECLARE_PLATFORM(maxpool3dnew_bp);
+            DECLARE_PLATFORM(maxpool3dnew_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(avgpool3dnew);
+            DECLARE_PLATFORM(avgpool3dnew, ENGINE_CPU);

-            DECLARE_PLATFORM(avgpool3dnew_bp);
+            DECLARE_PLATFORM(avgpool3dnew_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(lrn);
+            DECLARE_PLATFORM(lrn, ENGINE_CPU);

-            DECLARE_PLATFORM(batchnorm);
+            DECLARE_PLATFORM(batchnorm, ENGINE_CPU);

-            DECLARE_PLATFORM(batchnorm_bp);
+            DECLARE_PLATFORM(batchnorm_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(lstmLayer);
+            DECLARE_PLATFORM(lstmLayer, ENGINE_CPU);

-            DECLARE_PLATFORM(deconv2d);
+            DECLARE_PLATFORM(deconv2d, ENGINE_CPU);

-            DECLARE_PLATFORM(deconv2d_tf);
+            DECLARE_PLATFORM(deconv2d_tf, ENGINE_CPU);

-            DECLARE_PLATFORM(deconv3d);
+            DECLARE_PLATFORM(deconv3d, ENGINE_CPU);

-            DECLARE_PLATFORM(deconv2d_bp);
+            DECLARE_PLATFORM(deconv2d_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(deconv3d_bp);
+            DECLARE_PLATFORM(deconv3d_bp, ENGINE_CPU);

-            DECLARE_PLATFORM(depthwise_conv2d);
-            DECLARE_PLATFORM(depthwise_conv2d_bp);
+            DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CPU);
+            
+            DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CPU);
        }
    }

--- a/libnd4j/include/platform_boilerplate.h
+++ b/libnd4j/include/platform_boilerplate.h
@ -21,25 +21,37 @@
 #ifndef SD_PLATFORM_BOILERPLATE_H
 #define SD_PLATFORM_BOILERPLATE_H

-
-#define DECLARE_PLATFORM(NAME)      class ND4J_EXPORT PLATFORM_##NAME : public PlatformHelper {\
-                                    public: \
-                                        PLATFORM_##NAME() :  PlatformHelper(#NAME) { } \
-                                        bool isUsable(graph::Context &context) override; \
-                                        Nd4jStatus invokeHelper(graph::Context &context) override; \
-                                    };
-
-#define PLATFORM_IMPL(NAME)         struct ND4J_EXPORT __registratorPlatformHelper_##NAME { \
-                                        __registratorPlatformHelper_##NAME() { \
-                                            auto helper = new PLATFORM_##NAME(); \
-                                            OpRegistrator::getInstance()->registerHelper(helper); \
-                                        } \
-                                    }; \
-                                    static __registratorPlatformHelper_##NAME platformHelper_##NAME; \
-                                    Nd4jStatus PLATFORM_##NAME::invokeHelper(nd4j::graph::Context &block)
+#include <execution/Engine.h>


-#define PLATFORM_CHECK(NAME)        bool PLATFORM_##NAME::isUsable(graph::Context &block)
+// CONCATP(A, B): token-pastes its two arguments with an underscore between them, producing the single identifier A_B.
+#define CONCATP(A,B) A ##_##B
+// DECLARE_PLATFORM_F(NAME, ENGINE, CNAME): declares class PLATFORM_<CNAME>, a PlatformHelper whose constructor passes the stringized op name (#NAME) and samediff::Engine::<ENGINE> to the base class.
+// The class only declares isUsable() and invokeHelper(); the macro itself provides no bodies for them.
+#define DECLARE_PLATFORM_F(NAME, ENGINE, CNAME)      class ND4J_EXPORT PLATFORM_##CNAME : public PlatformHelper {\
+                                                     public: \
+                                                        PLATFORM_##CNAME() :  PlatformHelper(#NAME, samediff::Engine::ENGINE) { } \
+                                                        bool isUsable(graph::Context &context) override; \
+                                                        Nd4jStatus invokeHelper(graph::Context &context) override; \
+                                                    };
+// DECLARE_PLATFORM(NAME, ENGINE): user-facing form; builds CNAME as NAME_ENGINE, so (conv2d, ENGINE_CPU) declares PLATFORM_conv2d_ENGINE_CPU and one op can have a distinct helper class per engine.
+#define DECLARE_PLATFORM(NAME, ENGINE) DECLARE_PLATFORM_F(NAME, ENGINE, NAME ##_## ENGINE)
+// PLATFORM_IMPL_F(NAME, ENGINE, CNAME): defines a registrator struct and a static instance of it; the struct's constructor allocates a PLATFORM_<CNAME> with new and passes it to OpRegistrator::getInstance()->registerHelper().
+#define PLATFORM_IMPL_F(NAME, ENGINE, CNAME)         struct ND4J_EXPORT __registratorPlatformHelper_##CNAME { \
+                                                        __registratorPlatformHelper_##CNAME() { \
+                                                            auto helper = new PLATFORM_##CNAME(); \
+                                                            OpRegistrator::getInstance()->registerHelper(helper); \
+                                                        } \
+                                                    }; \
+                                                    static __registratorPlatformHelper_##CNAME platformHelper_##CNAME; \
+                                                    Nd4jStatus PLATFORM_##CNAME::invokeHelper(nd4j::graph::Context &block)
+// The expansion above ends with the signature of PLATFORM_<CNAME>::invokeHelper, so the braces written after the macro invocation become that method's body (the context parameter is named "block").
+// PLATFORM_IMPL(NAME, ENGINE): user-facing form; derives CNAME as NAME_ENGINE, the same suffix DECLARE_PLATFORM uses for the class name.
+#define PLATFORM_IMPL(NAME, ENGINE) PLATFORM_IMPL_F(NAME, ENGINE, NAME ##_## ENGINE)
+// PLATFORM_CHECK_F(NAME, ENGINE, CNAME): expands to the signature of PLATFORM_<CNAME>::isUsable; the braces following the invocation become its body. NAME and ENGINE do not appear in the expansion.
+// PLATFORM_CHECK(NAME, ENGINE): user-facing form; derives CNAME as NAME_ENGINE.
+#define PLATFORM_CHECK_F(NAME, ENGINE, CNAME)        bool PLATFORM_##CNAME::isUsable(graph::Context &block)
+#define PLATFORM_CHECK(NAME, ENGINE) PLATFORM_CHECK_F(NAME, ENGINE, NAME ##_## ENGINE)


 #endif //SD_PLATFORM_BOILERPLATE_H
--- a/libnd4j/include/play.h
+++ b/libnd4j/include/play.h
@ -21,8 +21,9 @@
 #ifndef LIBND4J_PLAY_H
 #define LIBND4J_PLAY_H

-#include <type_boilerplate.h>
-
+//#include <type_boilerplate.h>
+#include <platform_boilerplate.h>
+/*
 #define DATA_TYPES \
        (DATA_FLOAT, float) ,\
        (DATA_DOUBLE, double)
@ -41,6 +42,9 @@


 BUILD_SINGLE_TEMPLATE_TWICE(template class functionName, , DATA_TYPES)
+ */
+
+DECLARE_PLATFORM(conv2d, ENGINE_CPU)

 //BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functionName, (signature), DATA_TYPES, Y_TYPES);

--- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
@ -135,12 +135,18 @@ elseif(CUDA_BLAS)

 	add_executable(runtests ${TEST_SOURCES})

-    message("MSVC runtime for tests: ${MSVC_RT_LIB}")
+    if (WIN32)
+        message("MSVC runtime for tests: ${MSVC_RT_LIB}")
+    endif()

    # applies to windows only
    set_property(TARGET runtests PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
    set_property(TARGET gtest PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
    set_property(TARGET gtest_main PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")

-	target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} gtest gtest_main)
+    if (HAVE_CUDNN)  # report the cuDNN library value that is appended to the runtests link line below
+        message("CUDNN library: ${CUDNN}")
+    endif()
+    # ${CUDNN} and ${MKLDNN} are expanded unquoted, so an unset or empty variable adds no argument. NOTE(review): where they are set is not visible in this hunk.
+	target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN} gtest gtest_main)
 endif()
--- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp
--- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp
@ -2416,7 +2416,7 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_9) {
    ASSERT_EQ(Status::OK(), results->status());

    ASSERT_TRUE(expOutput.isSameShape(output));
-    ASSERT_TRUE(expOutput.equalsTo(output));
+    ASSERT_TRUE(expOutput.equalsTo(output, 1e-4));

    delete results;
 }
--- a/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu
@ -0,0 +1,128 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+
+//
+// @author raver119@gmail.com
+//
+
+#include "testlayers.h"
+#include <initializer_list>
+#include <NDArrayFactory.h>
+#include <ops/declarable/PlatformHelper.h>
+#include <ops/declarable/CustomOperations.h>
+#include <execution/Engine.h>
+
+#ifdef HAVE_CUDNN
+
+#include <ops/declarable/platform/cudnn/cudnnUtils.h>
+
+#endif
+
+using namespace nd4j;
+// Empty gtest fixture; it only supplies the suite name for the TEST_F cases in this file.
+class CuDnnTests : public testing::Test {
+public:
+
+};
+// Prints "Initialized [<name>]" once per helper in the list, using each helper's name().
+static void printer(std::initializer_list<nd4j::ops::platforms::PlatformHelper*> helpers) {
+    // each element v is a PlatformHelper*; only its name is read
+    for (auto v:helpers) {
+        nd4j_printf("Initialized [%s]\n", v->name().c_str());
+    }
+}
+// Instantiates one object of each cuDNN platform helper class listed below and prints its name.
+// When HAVE_CUDNN is not defined the body is empty and the test passes without doing anything.
+TEST_F(CuDnnTests, helpers_includer) {
+    // this block references every helper explicitly so they stay in the binary and are not optimized out by the linker
+#ifdef HAVE_CUDNN
+    nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CUDA conv2d;
+    nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CUDA conv2d_bp;
+    nd4j::ops::platforms::PLATFORM_conv3dnew_ENGINE_CUDA conv3dnew;
+    nd4j::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CUDA conv3dnew_bp;
+    nd4j::ops::platforms::PLATFORM_depthwise_conv2d_ENGINE_CUDA depthwise_conv2d;
+    nd4j::ops::platforms::PLATFORM_depthwise_conv2d_bp_ENGINE_CUDA depthwise_conv2d_bp;
+    nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CUDA batchnorm;
+    // the class names have the form PLATFORM_<op>_ENGINE_CUDA; each object is handed to printer() so it is actually used
+    printer({&conv2d});
+    printer({&conv2d_bp});
+    printer({&conv3dnew});
+    printer({&conv3dnew_bp});
+    printer({&depthwise_conv2d});
+    printer({&depthwise_conv2d_bp});
+    printer({&batchnorm});
+#endif
+}
+// Runs the same conv2d problem twice through one op instance: once with the context's target engine set to ENGINE_CUDA and once with ENGINE_CPU,
+// and requires both runs to return Status::OK() and to equal expOutput. The body is compiled only when both HAVE_CUDNN and HAVE_MKLDNN are defined.
+TEST_F(CuDnnTests, mixed_helpers_test_1) {
+#if defined(HAVE_CUDNN) && defined (HAVE_MKLDNN)
+    nd4j_printf("Mixed platforms test\n", "");
+
+    // NOTE(review): names suggest b = batch, i/o = input/output, k = kernel, s = stride, p = padding, d = dilation — inferred from naming only
+    int bS=2, iH=4,iW=3,  iC=4,oC=3,  kH=3,kW=2,  sH=1,sW=1,  pH=0,pW=0,  dH=1,dW=1;
+    int       oH=2,oW=2;
+    int paddingMode = 0;             // 1-SAME, 0-VALID;
+    int dataFormat  = 0;             // 1-NHWC, 0-NCHW
+    // input is created as {bS, iC, iH, iW}, i.e. channels-first, consistent with dataFormat = 0 above
+    auto input    = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});
+    auto weights  = NDArrayFactory::create<float>('c', {oC, iC, kH, kW});
+    auto bias     = NDArrayFactory::create<float>('c', {oC}, {1,2,3});
+    // expected values, plus one output array per engine obtained from expOutput.like()
+    auto expOutput = NDArrayFactory::create<float>('c', {bS, oC, oH, oW}, {61.f,   61.f,  61.f,   61.f, 177.2f,  177.2f, 177.2f,  177.2f, 293.4f,  293.4f, 293.4f,  293.4f,  61.f,   61.f,  61.f,   61.f, 177.2f,  177.2f, 177.2f,  177.2f, 293.4f,  293.4f, 293.4f,  293.4f});
+    auto zCUDA = expOutput.like();
+    auto zMKL = expOutput.like();
+    // fill the operands; the weights are created as {oC, iC, kH, kW} and then reordered in place with permutei({2,3,1,0})
+    input = 2.;
+    weights.linspace(0.1, 0.1);
+    weights.permutei({2,3,1,0});
+    // NOTE(review): presumably brings the host-side buffers up to date before either engine runs — confirm
+    input.syncToHost();
+    weights.syncToHost();
+    bias.syncToHost();
+
+    nd4j::ops::conv2d op;
+
+    // cuDNN part
+    Context cuda(1);
+    cuda.setTargetEngine(samediff::Engine::ENGINE_CUDA);
+    cuda.setInputArray(0, &input);
+    cuda.setInputArray(1, &weights);
+    cuda.setInputArray(2, &bias);
+    cuda.setOutputArray(0, &zCUDA);
+    cuda.setIArguments({kH,kW,  sH,sW,  pH,pW,  dH,dW, paddingMode, dataFormat});
+    auto statusCUDA = op.execute(&cuda);
+
+    ASSERT_EQ(Status::OK(), statusCUDA);
+    ASSERT_EQ(expOutput, zCUDA);
+
+    // MKL-DNN part
+    Context mkl(1);
+    mkl.setTargetEngine(samediff::Engine::ENGINE_CPU);
+    mkl.setInputArray(0, &input);
+    mkl.setInputArray(1, &weights);
+    mkl.setInputArray(2, &bias);
+    mkl.setOutputArray(0, &zMKL);
+    mkl.setIArguments({kH,kW,  sH,sW,  pH,pW,  dH,dW, paddingMode, dataFormat});
+    auto statusMKL = op.execute(&mkl);
+    // NOTE(review): presumably marks zMKL's host buffer as the most recently written copy so the comparison below sees the CPU result — confirm
+    zMKL.tickWriteHost();
+
+    ASSERT_EQ(Status::OK(), statusMKL);
+    ASSERT_EQ(expOutput, zMKL);
+#endif
+}
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp
@ -3280,209 +3280,6 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) {
    delete results;
 }

-////////////////////////////////////////////////////////////////////
-TEST_F(DeclarableOpsTests10, batchnorm_test1) {
-
-    NDArray input   ('c', {2,4}, nd4j::DataType::FLOAT32);
-    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
-    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f,  1.1f},  nd4j::DataType::FLOAT32);
-    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
-    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
-
-    NDArray expected('c', {2,4}, {11.61218734f,  18.52390321f,  -8.67185076f, -21.28716864f, 10.93337162f,  19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32);
-
-    input.linspace(0.1, 0.1);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-    // output->printBuffer();
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test2) {
-
-    auto input    = NDArrayFactory::create<TypeParam>('c', {2,3,4});
-    auto mean     = NDArrayFactory::create<TypeParam>('c', {4});
-    auto variance = NDArrayFactory::create<TypeParam>('c', {4});
-    auto gamma    = NDArrayFactory::create<TypeParam>('c', {4});
-    auto beta     = NDArrayFactory::create<TypeParam>('c', {4});
-
-    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f,
-                                            1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f});
-
-    input.linspace(0.1, 0.1);
-    mean.assign(1.);
-    variance.assign(0.5);
-    gamma.assign(1.2);
-    beta.assign(1.);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-    // output->printBuffer();
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test3) {
-
-    auto input    = NDArrayFactory::create<TypeParam>('c', {2,3,4});
-    auto mean     = NDArrayFactory::create<TypeParam>('c', {3}, {1.05f, 1.1f, 1.15f});
-    auto variance = NDArrayFactory::create<TypeParam>('c', {3}, {0.5f, 0.6f, 0.7f});
-    auto gamma    = NDArrayFactory::create<TypeParam>('c', {3}, {1.2f, 1.3f, 1.4f});
-    auto beta     = NDArrayFactory::create<TypeParam>('c', {3}, {0.1f, 0.2f, 0.3f});
-
-    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f,
-                                            0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f});
-
-    input.linspace(0.1, 0.1);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test4) {
-
-    auto input    = NDArrayFactory::create<TypeParam>('c', {2,3,4});
-    auto mean     = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f});
-    auto variance = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f});
-    auto gamma    = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f});
-    auto beta     = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f});
-
-    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f,
-                                            0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f});
-
-    input.linspace(0.1, 0.1);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TEST_F(DeclarableOpsTests10, batchnorm_test5) {
-
-    NDArray input   ('c', {2,4,2,2}, nd4j::DataType::FLOAT32);
-    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
-    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f,  1.1f},  nd4j::DataType::FLOAT32);
-    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
-    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
-
-    NDArray expected('c', {2,4,2,2}, { 11.612187f,  11.442483f,  11.272779f,  11.103076f,  18.990039f,  19.145418f,  19.300796f,  19.456175f,  -9.557284f,  -9.704856f,  -9.852428f, -10.f, -20.f,
-                                      -19.856981f, -19.713963f, -19.570944f,   8.896924f,   8.727221f,   8.557517f,   8.387813f,  21.476097f,  21.631475f,  21.786854f,  21.942233f, -11.918438f,
-                                       -12.06601f, -12.213582f, -12.361154f,   -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32);
-    input.linspace(0.1, 0.1);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-    // output->printBuffer();
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TEST_F(DeclarableOpsTests10, batchnorm_test6) {
-
-    NDArray input   ('c', {2,2,2,4}, nd4j::DataType::FLOAT32);
-    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
-    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9,  1.1f},  nd4j::DataType::FLOAT32);
-    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
-    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
-
-    NDArray expected('c', {2,2,2,4}, {11.612187f,  18.523903f,  -8.671851f, -21.287169f,  10.933372f,  19.145418f,  -9.262139f, -20.715094f,  10.254556f,  19.766932f,  -9.852428f, -20.143019f,   9.57574f,
-                                      20.388447f, -10.442716f, -19.570944f,   8.896924f,  21.009961f, -11.033005f, -18.998869f,   8.218109f,  21.631475f, -11.623294f, -18.426794f,   7.539293f,  22.25299f,
-                                     -12.213582f, -17.854719f,   6.860477f,  22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32);
-    input.linspace(0.1, 0.1);
-
-    nd4j::ops::batchnorm op;
-
-    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3});
-
-    ASSERT_EQ(ND4J_STATUS_OK, results->status());
-
-    auto output = results->at(0);
-
-    ASSERT_TRUE(expected.isSameShapeStrict(*output));
-    ASSERT_TRUE(expected.equalsTo(output));
-
-    delete results;
-}
-
-////////////////////////////////////////////////////////////////////
-TEST_F(DeclarableOpsTests10, batchnorm_test7) {
-
-    NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
-    NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32);
-    input2.permutei({0,3,1,2});
-
-    NDArray mean    ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
-    NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
-    NDArray gamma   ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
-    NDArray beta    ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
-
-    NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
-    NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
-
-    input1.linspace(-1012, 1);
-    input2.assign(input1);
-
-    nd4j::ops::batchnorm op;
-
-    auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {});
-    ASSERT_EQ(ND4J_STATUS_OK, res1);
-
-    auto res2 = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {});
-    ASSERT_EQ(ND4J_STATUS_OK, res2);
-
-    ASSERT_TRUE(out1.equalsTo(out2));
-}
-
 ///////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests10, bool_broadcast_test_1) {

--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp
@ -38,6 +38,19 @@ public:
    }
 };

+template <typename T>
+class TypedDeclarableOpsTests13 : public testing::Test {
+public:
+    // typed gtest fixture; its constructor only writes a newline and flushes stdout
+    TypedDeclarableOpsTests13() {
+        printf("\n");
+        fflush(stdout);
+    }
+};
+// type list for the typed suite: TYPED_TEST cases on TypedDeclarableOpsTests13 are instantiated for double and float
+typedef ::testing::Types<double, float> TestingTypes;
+TYPED_TEST_CASE(TypedDeclarableOpsTests13, TestingTypes);
+
 TEST_F(DeclarableOpsTests13, test_pow_1) {
    auto x = NDArrayFactory::create<float>('c', {2, 2}, {2.f, 2.f, 2.f, 2.f});
    auto y = NDArrayFactory::create<int>('c', {2}, {3, 3});
@ -1948,3 +1961,289 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) {
 }


+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test1) {
+    // Rank-2 case: a 2x4 input normalized with one mean/variance/gamma/beta entry per column.
+    // Only two integer args are passed (presumably the gamma/beta switches); no axis entry follows them.
+    NDArray input('c', {2,4}, nd4j::DataType::FLOAT32);
+    input.linspace(0.1, 0.1);
+
+    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f,  1.1f},  nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
+
+    // Reference values agree with gamma * (x - mean) / sqrt(variance + 1e-5) + beta.
+    NDArray expected('c', {2,4}, {11.61218734f,  18.52390321f,  -8.67185076f, -21.28716864f, 10.93337162f,  19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::batchnorm op;
+    auto resultSet = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultSet->status());
+
+    auto actual = resultSet->at(0);
+
+    ASSERT_TRUE(expected.isSameShapeStrict(*actual));
+    ASSERT_TRUE(expected.equalsTo(actual));
+
+    delete resultSet;
+}
+
+////////////////////////////////////////////////////////////////////
+TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test2) {
+    // Rank-3 case with constant parameters: every length-4 parameter vector is filled with a single scalar.
+    auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
+    input.linspace(0.1, 0.1);
+
+    auto mean = NDArrayFactory::create<TypeParam>('c', {4});
+    mean.assign(1.);
+
+    auto variance = NDArrayFactory::create<TypeParam>('c', {4});
+    variance.assign(0.5);
+
+    auto gamma = NDArrayFactory::create<TypeParam>('c', {4});
+    gamma.assign(1.2);
+
+    auto beta = NDArrayFactory::create<TypeParam>('c', {4});
+    beta.assign(1.);
+
+    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f,
+                                            1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f});
+
+    nd4j::ops::batchnorm op;
+    auto resultSet = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultSet->status());
+
+    auto actual = resultSet->at(0);
+    ASSERT_TRUE(expected.isSameShapeStrict(*actual));
+    ASSERT_TRUE(expected.equalsTo(actual));
+
+    delete resultSet;
+}
+
+////////////////////////////////////////////////////////////////////
+TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test3) {
+    // Rank-3 case with length-3 parameters: the last integer arg (1) matches the size-3 dimension
+    // of the {2,3,4} input, and the reference values vary along that dimension.
+    auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
+    input.linspace(0.1, 0.1);
+
+    auto mean     = NDArrayFactory::create<TypeParam>('c', {3}, {1.05f, 1.1f, 1.15f});
+    auto variance = NDArrayFactory::create<TypeParam>('c', {3}, {0.5f, 0.6f, 0.7f});
+    auto gamma    = NDArrayFactory::create<TypeParam>('c', {3}, {1.2f, 1.3f, 1.4f});
+    auto beta     = NDArrayFactory::create<TypeParam>('c', {3}, {0.1f, 0.2f, 0.3f});
+
+    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f,
+                                            0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f});
+
+    nd4j::ops::batchnorm op;
+    auto resultSet = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultSet->status());
+
+    auto actual = resultSet->at(0);
+
+    ASSERT_TRUE(expected.isSameShapeStrict(*actual));
+    ASSERT_TRUE(expected.equalsTo(actual));
+
+    delete resultSet;
+}
+
+////////////////////////////////////////////////////////////////////
+TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test4) {
+    // Rank-3 case with {2,1,4} parameters: they vary along dimensions 0 and 2 of the {2,3,4} input
+    // (the trailing integer args are 0 and 2) and are shared across dimension 1.
+    auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
+    input.linspace(0.1, 0.1);
+
+    auto mean     = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f});
+    auto variance = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f});
+    auto gamma    = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f});
+    auto beta     = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f});
+
+    auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f,
+                                            0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f});
+
+    nd4j::ops::batchnorm op;
+    auto resultSet = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultSet->status());
+
+    auto actual = resultSet->at(0);
+
+    ASSERT_TRUE(expected.isSameShapeStrict(*actual));
+    ASSERT_TRUE(expected.equalsTo(actual));
+
+    delete resultSet;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test5) {
+    // Rank-4 {2,4,2,2} input with length-4 parameters; the last integer arg (1) matches the size-4
+    // dimension, and the reference values change parameters along that dimension.
+    NDArray input('c', {2,4,2,2}, nd4j::DataType::FLOAT32);
+    input.linspace(0.1, 0.1);
+
+    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f,  1.1f},  nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
+
+    NDArray expected('c', {2,4,2,2}, { 11.612187f,  11.442483f,  11.272779f,  11.103076f,  18.990039f,  19.145418f,  19.300796f,  19.456175f,  -9.557284f,  -9.704856f,  -9.852428f, -10.f, -20.f,
+                                      -19.856981f, -19.713963f, -19.570944f,   8.896924f,   8.727221f,   8.557517f,   8.387813f,  21.476097f,  21.631475f,  21.786854f,  21.942233f, -11.918438f,
+                                       -12.06601f, -12.213582f, -12.361154f,   -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::batchnorm op;
+    auto resultSet = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultSet->status());
+
+    auto actual = resultSet->at(0);
+
+    ASSERT_TRUE(expected.isSameShapeStrict(*actual));
+    ASSERT_TRUE(expected.equalsTo(actual));
+
+    delete resultSet;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test6) {
+    // Rank-4 {2,2,2,4} input with length-4 parameters; the last integer arg (3) matches the trailing size-4 dimension.
+    NDArray input   ('c', {2,2,2,4}, nd4j::DataType::FLOAT32);
+    NDArray mean    ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f,  1.1f},  nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {4}, {10.f, 20.f, -10.f, -20.f},     nd4j::DataType::FLOAT32);
+
+    NDArray expected('c', {2,2,2,4}, {11.612187f,  18.523903f,  -8.671851f, -21.287169f,  10.933372f,  19.145418f,  -9.262139f, -20.715094f,  10.254556f,  19.766932f,  -9.852428f, -20.143019f,   9.57574f,
+                                      20.388447f, -10.442716f, -19.570944f,   8.896924f,  21.009961f, -11.033005f, -18.998869f,   8.218109f,  21.631475f, -11.623294f, -18.426794f,   7.539293f,  22.25299f,
+                                     -12.213582f, -17.854719f,   6.860477f,  22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32);
+    input.linspace(0.1, 0.1);
+
+    nd4j::ops::batchnorm op;
+
+    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3});
+
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto output = results->at(0);
+
+    ASSERT_TRUE(expected.isSameShapeStrict(*output));
+    ASSERT_TRUE(expected.equalsTo(output));
+
+    delete results;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test7) {
+    // Feeds the same values to the op in two layouts: a plain {3,3,15,15} array and a {3,15,15,3}
+    // array permuted with {0,3,1,2} before the values are copied in. Both outputs must be equal.
+    NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
+    input1.linspace(-1012, 1);
+
+    NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32);
+    input2.permutei({0,3,1,2});
+    input2.assign(input1);
+
+    NDArray mean    ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
+
+    NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
+    NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::batchnorm op;
+    auto statusPlain = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, statusPlain);
+
+    auto statusPermuted = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, statusPermuted);
+
+    ASSERT_TRUE(out1.equalsTo(out2));
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test8) {
+    // Rank-4 case: {2,3,4,5} input with {1,3,4,5} parameters filled with constants; trailing integer args are 1,2,3.
+    NDArray input('c', {2,3,4,5}, nd4j::DataType::FLOAT32);
+
+    NDArray mean    ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
+    // Reference values agree with 1.2 * (x - 1) / sqrt(0.5 + 1e-5) - 1.5 for x = -60, -59, ...
+    NDArray expected('c', {2,3,4,5}, {-105.019394, -103.322357, -101.625313, -99.928276, -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922,
+                        -82.957886, -81.260841, -79.563805, -77.866768, -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414,
+                        -60.896374, -59.199333, -57.502296, -55.805256, -54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902,
+                        -38.834862, -37.137825, -35.440784, -33.743744, -32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392,
+                        -16.773354, -15.076314, -13.379274, -11.682236, -9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157,
+                        6.985196, 8.682236, 10.379274, 12.076314, 13.773354, 15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744,
+                        32.440784, 34.137825, 35.834862, 37.531902, 39.228943, 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333,
+                        57.896374, 59.593414, 61.290451, 62.987488, 64.684532, 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922,
+                        83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276}, nd4j::DataType::FLOAT32);
+
+    input.linspace(-60, 1);
+    mean.assign(1.);
+    variance.assign(0.5);
+    gamma.assign(1.2);
+    beta.assign(-1.5);
+
+    nd4j::ops::batchnorm op;
+
+    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3});
+
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto output = results->at(0);
+
+    ASSERT_TRUE(expected.isSameShape(*output));
+    ASSERT_TRUE(expected.equalsTo(output));
+
+    delete results;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batchnorm_test9) {
+    // Rank-5 case: {2,3,3,3,3} input with {1,3,3,3,3} parameters filled with constants; trailing integer args are 1,2,3,4.
+    NDArray input('c', {2,3,3,3,3}, nd4j::DataType::FLOAT32);
+
+    NDArray mean    ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
+    NDArray variance('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
+    NDArray gamma   ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
+    NDArray beta    ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
+    // Reference values agree with 1.2 * (x - 1) / sqrt(0.5 + 1e-5) - 1.5 for x = -80, -79, ...
+    NDArray expected('c', {2,3,3,3,3}, {-138.960175, -137.263138, -135.566101, -133.869064, -132.172028, -130.474976, -128.777954, -127.080902, -125.383865, -123.686829, -121.989784, -120.292747,
+                            -118.595711, -116.898666, -115.201630, -113.504593, -111.807549, -110.110512, -108.413475, -106.716431, -105.019394, -103.322357, -101.625313, -99.928276,
+                            -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922, -82.957886, -81.260841, -79.563805, -77.866768,
+                            -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414, -60.896374, -59.199333, -57.502296, -55.805256,
+                            -54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902, -38.834862, -37.137825, -35.440784, -33.743744,
+                            -32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392, -16.773354, -15.076314, -13.379274, -11.682236,
+                            -9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157, 6.985196, 8.682236, 10.379274, 12.076314, 13.773354,
+                            15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744, 32.440784, 34.137825, 35.834862, 37.531902, 39.228943,
+                            40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, 57.896374, 59.593414, 61.290451, 62.987488, 64.684532,
+                            66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922, 83.351959, 85.049004, 86.746040, 88.443077, 90.140121,
+                            91.837158, 93.534195, 95.231239, 96.928276, 98.625313, 100.322357, 102.019394, 103.716431, 105.413475, 107.110512, 108.807549, 110.504593, 112.201630, 113.898666,
+                            115.595711, 117.292747, 118.989784, 120.686829, 122.383865, 124.080902, 125.777946, 127.474976, 129.172028, 130.869064, 132.566101, 134.263138}, nd4j::DataType::FLOAT32);
+
+    input.linspace(-80, 1);
+    mean.assign(1.);
+    variance.assign(0.5);
+    gamma.assign(1.2);
+    beta.assign(-1.5);
+
+    nd4j::ops::batchnorm op;
+
+    auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3,4});
+
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto output = results->at(0);
+    // output->printBuffer();
+
+    ASSERT_TRUE(expected.isSameShape(*output));
+    ASSERT_TRUE(expected.equalsTo(output));
+
+    delete results;
+}
--- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
@ -45,26 +45,26 @@ static void printer(std::initializer_list<nd4j::ops::platforms::PlatformHelper*>
 TEST_F(MklDnnTests, helpers_includer) {
    // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
 #ifdef HAVE_MKLDNN
-    nd4j::ops::platforms::PLATFORM_conv2d conv2d;
-    nd4j::ops::platforms::PLATFORM_conv2d_bp conv2d_bp;
+    nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d;
+    nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp;

-    nd4j::ops::platforms::PLATFORM_conv2d conv3d;
-    nd4j::ops::platforms::PLATFORM_conv2d_bp conv3d_bp;
+    nd4j::ops::platforms::PLATFORM_conv3dnew_ENGINE_CPU conv3d;        // reference the 3D helpers, not a second copy of conv2d
+    nd4j::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CPU conv3d_bp;

-    nd4j::ops::platforms::PLATFORM_avgpool2d avgpool2d;
-    nd4j::ops::platforms::PLATFORM_avgpool2d_bp avgpool2d_bp;
+    nd4j::ops::platforms::PLATFORM_avgpool2d_ENGINE_CPU avgpool2d;
+    nd4j::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CPU avgpool2d_bp;

-    nd4j::ops::platforms::PLATFORM_maxpool2d maxpool2d;
-    nd4j::ops::platforms::PLATFORM_maxpool2d_bp maxpool2d_bp;
+    nd4j::ops::platforms::PLATFORM_maxpool2d_ENGINE_CPU maxpool2d;
+    nd4j::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CPU maxpool2d_bp;

-    nd4j::ops::platforms::PLATFORM_avgpool3dnew avgpool3d;
-    nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp avgpool3d_bp;
+    nd4j::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CPU avgpool3d;
+    nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CPU avgpool3d_bp;

-    nd4j::ops::platforms::PLATFORM_maxpool3dnew maxpool3d;
-    nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp maxpool3d_bp;
+    nd4j::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CPU maxpool3d;
+    nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp;

-    nd4j::ops::platforms::PLATFORM_lrn lrn;
-    nd4j::ops::platforms::PLATFORM_batchnorm batchnorm;
+    nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn;
+    nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm;

    printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm});
 #endif
--- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu
@ -247,8 +247,10 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) {
    auto res = cudaStreamSynchronize(*stream);
    ASSERT_EQ(0, res);
    //double* localBuffer = ;
+    z.syncToHost();
    cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost);
    res = cudaStreamSynchronize(*stream);
+    z.tickWriteHost();
    ASSERT_EQ(0, res);

    //
--- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
@ -150,7 +150,7 @@ if ("${EXPERIMENTAL}" STREQUAL "yes")
 endif()

 # tests are always compiled with all ops included
-SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true")
+string(APPEND CMAKE_CXX_FLAGS " -DLIBND4J_ALL_OPS=true -DDEFAULT_ENGINE=samediff::ENGINE_CPU")  # also defines DEFAULT_ENGINE as samediff::ENGINE_CPU

 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    # using Clang
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java
@ -38,6 +38,7 @@ import org.bytedeco.javacpp.tools.InfoMapper;
                        "array/ConstantDataBuffer.h",
                        "array/TadPack.h",
                        "execution/ErrorReference.h",
+                        "execution/Engine.h",
                        "memory/MemoryType.h",
                        "Environment.h",
                        "types/utf8string.h",
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java
@ -41,6 +41,7 @@ import java.util.Scanner;
                                              "array/ConstantDescriptor.h",
                                              "array/TadPack.h",
                                              "execution/ErrorReference.h",
+                                              "execution/Engine.h",
                                              "Environment.h",
                                              "types/utf8string.h",
                                              "NativeOps.h",