Merge pull request #8803 from KonduitAI/master

Development updates
2020-03-24 19:14:07 +11:00 · 2020-03-24 19:14:07 +11:00 · 44394da4b8
commit 44394da4b8
parent 7c05928185 4e8f3a025f
362 changed files with 13077 additions and 5439 deletions
--- a/datavec/datavec-python/src/main/java/org/datavec/python/PythonObject.java
+++ b/datavec/datavec-python/src/main/java/org/datavec/python/PythonObject.java
@ -77,7 +77,7 @@ public class PythonObject {

        long address = bp.address();
        long size = bp.capacity();
-        NumpyArray npArr = NumpyArray.builder().address(address).shape(new long[]{size}).strides(new long[]{1}).dtype(DataType.BYTE).build();
+        NumpyArray npArr = NumpyArray.builder().address(address).shape(new long[]{size}).strides(new long[]{1}).dtype(DataType.INT8).build();
        nativePythonObject = Python.memoryview(new PythonObject(npArr)).nativePythonObject;
    }

@ -320,20 +320,23 @@ public class PythonObject {
    public NumpyArray toNumpy() throws PythonException{
        PyObject np = PyImport_ImportModule("numpy");
        PyObject ndarray = PyObject_GetAttrString(np, "ndarray");
-        if (PyObject_IsInstance(nativePythonObject, ndarray) == 0){
+        if (PyObject_IsInstance(nativePythonObject, ndarray) != 1){
            throw new PythonException("Object is not a numpy array! Use Python.ndarray() to convert object to a numpy array.");
        }
        Py_DecRef(ndarray);
        Py_DecRef(np);
+
        Pointer objPtr = new Pointer(nativePythonObject);
        PyArrayObject npArr = new PyArrayObject(objPtr);
        Pointer ptr = PyArray_DATA(npArr);
-        SizeTPointer shapePtr = PyArray_SHAPE(npArr);
        long[] shape = new long[PyArray_NDIM(npArr)];
-        shapePtr.get(shape, 0, shape.length);
-        SizeTPointer stridesPtr = PyArray_STRIDES(npArr);
+        SizeTPointer shapePtr = PyArray_SHAPE(npArr);
+        if (shapePtr != null)
+            shapePtr.get(shape, 0, shape.length);
        long[] strides = new long[shape.length];
-        stridesPtr.get(strides, 0, strides.length);
+        SizeTPointer stridesPtr = PyArray_STRIDES(npArr);
+        if (stridesPtr != null)
+            stridesPtr.get(strides, 0, strides.length);
        int npdtype = PyArray_TYPE(npArr);

        DataType dtype;
@ -345,28 +348,27 @@ public class PythonObject {
            case NPY_SHORT:
                dtype = DataType.SHORT; break;
            case NPY_INT:
-                dtype = DataType.INT; break;
+                dtype = DataType.INT32; break;
            case NPY_LONG:
                dtype = DataType.LONG; break;
            case NPY_UINT:
                dtype = DataType.UINT32; break;
            case NPY_BYTE:
-                dtype = DataType.BYTE; break;
+                dtype = DataType.INT8; break;
            case NPY_UBYTE:
-                dtype = DataType.UBYTE; break;
+                dtype = DataType.UINT8; break;
            case NPY_BOOL:
                dtype = DataType.BOOL; break;
            case NPY_HALF:
-                dtype = DataType.HALF; break;
+                dtype = DataType.FLOAT16; break;
            case NPY_LONGLONG:
                dtype = DataType.INT64; break;
            case NPY_USHORT:
                dtype = DataType.UINT16; break;
            case NPY_ULONG:
-                dtype = DataType.UINT64; break;
            case NPY_ULONGLONG:
                dtype = DataType.UINT64; break;
-                default:
+            default:
                    throw new PythonException("Unsupported array data type: " + npdtype);
        }

--- a/datavec/datavec-python/src/test/java/org/datavec/python/ScalarAndArrayTest.java
+++ b/datavec/datavec-python/src/test/java/org/datavec/python/ScalarAndArrayTest.java
@ -0,0 +1,48 @@
+/* ******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.datavec.python;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.factory.Nd4j;
+
+import static junit.framework.TestCase.assertEquals;
+
+@RunWith(Parameterized.class)
+public class ScalarAndArrayTest {
+
+    @Parameterized.Parameters(name = "{index}: Testing with INDArray={0}")
+    public static INDArray[] data() {
+        return new INDArray[]{
+                Nd4j.scalar(10),
+                Nd4j.ones(10, 10, 10, 10)
+        };
+    }
+
+    private INDArray indArray;
+
+    public ScalarAndArrayTest(INDArray indArray) {
+        this.indArray = indArray;
+    }
+
+    @Test
+    public void testINDArray() throws PythonException {
+        assertEquals(indArray, new PythonObject(indArray).toNumpy().getNd4jArray());
+    }
+}
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/AssertTestsExtendBaseClass.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/AssertTestsExtendBaseClass.java
@ -37,6 +37,11 @@ import static org.junit.Assert.assertEquals;
@Slf4j
 public class AssertTestsExtendBaseClass extends BaseDL4JTest {

+    @Override
+    public long getTimeoutMilliseconds() {
+        return 240000L;
+    }
+
    //Set of classes that are exclusions to the rule (either run manually or have their own logging + timeouts)
    private static final Set<Class<?>> exclusions = new HashSet<>();

--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@ -79,10 +79,11 @@ if(NOT SD_CUDA)
    if ("${OPENBLAS_PATH}" STREQUAL "")
        #we don't want OpenBLAS on Apple
        if (NOT APPLE)
+            # note: this is not a typo
            set(BLA_VENDOR "OpenBLAS")
        endif()

-        # look around for system blas instead
+        # look around for system blas instead, see: https://cmake.org/cmake/help/latest/module/FindBLAS.html
        find_package(BLAS REQUIRED)
        if (BLAS_FOUND)
            message("Found external BLAS implementation: ${BLAS_LIBRARIES} ")
@ -91,6 +92,7 @@ if(NOT SD_CUDA)
    else()
        # if we have externally provided OPENBLAS_PATH - let's use it
        set(HAVE_OPENBLAS 1)
+        message("Setting openblas")
        include_directories(${OPENBLAS_PATH}/include/)
        link_directories(${OPENBLAS_PATH} ${OPENBLAS_PATH}/lib/)
        set(OPENBLAS_LIBRARIES openblas)
--- a/libnd4j/CMakeLists.txt.mkldnn.in
+++ b/libnd4j/CMakeLists.txt.mkldnn.in
@ -5,7 +5,7 @@ project(mkldnn-download NONE)
 include(ExternalProject)
 ExternalProject_Add(mkldnn
  GIT_REPOSITORY     https://github.com/intel/mkl-dnn.git
-  GIT_TAG           v1.2.1
+  GIT_TAG           v1.2.2
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src"
  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build"
  CONFIGURE_COMMAND ""
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@ -49,7 +49,7 @@ if (SD_IOS_BUILD)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSD_IOS_BUILD=true")
 endif()

-if(WIN32)
+if(WIN32 AND NOT ANDROID)
    get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
 	if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
@ -231,7 +231,11 @@ if(SD_CUDA)
                ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
                ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES})

-        add_library(${SD_LIBRARY_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
+        # Don't output dynamic linked lib when  a static lib build is specified unless the tests are built
+        if(NOT SD_STATIC_LIB OR SD_BUILD_TESTS)
+            add_library(${SD_LIBRARY_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
+        endif()
+

        if (WIN32)
            message("MSVC runtime for library: ${MSVC_RT_LIB}")
@ -241,7 +245,7 @@ if(SD_CUDA)
        if (SD_BUILD_TESTS OR SD_STATIC_LIB)
            add_library(${SD_LIBRARY_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
            set_property(TARGET ${SD_LIBRARY_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
-            install(TARGETS ${SD_LIBRARY_NAME}static DESTINATION .)
+            install(TARGETS ${SD_LIBRARY_NAME}static  DESTINATION .)
        endif()

        # on windows we want to make sure we use MT or MD, but since we use it in one lib, we must use it everywhere to avoid conflicts
@ -320,14 +324,16 @@ elseif(SD_CPU)
            ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
            ${OPS_SOURCES} ${PERF_SOURCES})
    if(IOS)
-        add_library(${SD_LIBRARY_NAME}       STATIC $<TARGET_OBJECTS:nd4jobj>)
+        add_library(${SD_LIBRARY_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>)
    else()
        # static library is built only if we're going to build tests, skip otherwise
        if (SD_BUILD_TESTS OR SD_STATIC_LIB)
            add_library(${SD_LIBRARY_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
        endif()

-        add_library(${SD_LIBRARY_NAME}       SHARED $<TARGET_OBJECTS:nd4jobj>)
+       if(SD_BUILD_TESTS OR NOT SD_STATIC_LIB)
+                add_library(${SD_LIBRARY_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
+        endif()
    endif()

    # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS
--- a/libnd4j/buildnativeoperations.sh
+++ b/libnd4j/buildnativeoperations.sh
@ -21,6 +21,33 @@ set -eu
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 cd "$DIR"

+setwindows_msys() {
+  if [[ $KERNEL == *"windows"* ]]; then
+  export CMAKE_COMMAND="$CMAKE_COMMAND -G \"MSYS Makefiles\""
+  fi
+}
+
+setandroid_defaults() {
+  if [[ -z ${ANDROID_NDK:-} ]]; then
+    export ANDROID_NDK=$HOME/Android/android-ndk/
+    echo "No ANDROID_NDK variable set. Setting to default of $ANDROID_NDK"
+    else
+        echo "USING ANDROID NDK $ANDROID_NDK"
+fi
+
+  if [[ -z ${ANDROID_VERSION:-} ]]; then
+    export ANDROID_VERSION=21
+    echo "No ANDROID_VERSION variable set. Setting to default of $ANDROID_VERSION"
+    else
+       echo "USING ANDROID VERSION $ANDROID_VERSION"
+    # android needs static linking
+
+fi
+
+
+}
+
+
 export CMAKE_COMMAND="cmake"
 if which cmake3 &> /dev/null; then
    export CMAKE_COMMAND="cmake3"
@ -57,7 +84,7 @@ VERBOSE_ARG="VERBOSE=1"
 HELPER=
 CHECK_VECTORIZATION="OFF"
 NAME=
-while [[ $# > 0 ]]
+while [[ $# -gt 0 ]]
 do
 key="$1"
 value="${2:-}"
@ -141,7 +168,7 @@ case $key in
            # unknown option
    ;;
 esac
-if [[ $# > 0 ]]; then
+if [[ $# -gt 0 ]]; then
    shift # past argument or value
 fi
 done
@ -190,44 +217,65 @@ case "$OS" in
      if [ -z "$ARCH" ]; then
        ARCH="armv7-a"
      fi
-      export ANDROID_BIN="$ANDROID_NDK/toolchains/arm-linux-androideabi-4.9/prebuilt/$KERNEL/"
+
+      setandroid_defaults
+
+      # Note here for android 32 bit prefix on the binutils is different
+      # See https://developer.android.com/ndk/guides/other_build_systems
+      export ANDROID_BIN="$ANDROID_NDK/toolchains/arm-linux-androideabi/prebuilt/$KERNEL/"
      export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/"
      export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang"
-      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-arm/"
+      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-$ANDROID_VERSION/arch-arm/"
      export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm.cmake -DSD_ANDROID_BUILD=true"
+      setwindows_msys
    ;;

    android-arm64)
      if [ -z "$ARCH" ]; then
        ARCH="armv8-a"
      fi
+
+      setandroid_defaults
+
+      echo "BUILDING ANDROID ARM with KERNEL $KERNEL"
      export ANDROID_BIN="$ANDROID_NDK/toolchains/aarch64-linux-android-4.9/prebuilt/$KERNEL/"
      export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/"
      export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang"
-      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-arm64/"
-      export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm64.cmake -DSD_ANDROID_BUILD=true"
+      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-$ANDROID_VERSION/arch-arm64/"
+      export CMAKE_COMMAND="$CMAKE_COMMAND  -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm64.cmake -DSD_ANDROID_BUILD=true"
+      setwindows_msys
    ;;

    android-x86)
      if [ -z "$ARCH" ]; then
        ARCH="i686"
      fi
-      export ANDROID_BIN="$ANDROID_NDK/toolchains/x86-4.9/prebuilt/$KERNEL/"
+
+      setandroid_defaults
+      export ANDROID_BIN="$ANDROID_NDK/toolchains/arm-linux-androideabi-4.9/prebuilt/$KERNEL/"
      export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/"
      export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang"
-      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-x86/"
+      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-$ANDROID_VERSION/arch-x86/"
      export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86.cmake -DSD_ANDROID_BUILD=true"
+      setwindows_msys
    ;;

    android-x86_64)
+
      if [ -z "$ARCH" ]; then
        ARCH="x86-64"
      fi
-      export ANDROID_BIN="$ANDROID_NDK/toolchains/x86_64-4.9/prebuilt/$KERNEL/"
+      echo "BUILDING ANDROID x86_64"
+
+      setandroid_defaults
+
+
+      export ANDROID_BIN="$ANDROID_NDK/toolchains/arm-linux-androideabi-4.9/prebuilt/$KERNEL/"
      export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/"
      export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang"
-      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-x86_64/"
+      export ANDROID_ROOT="$ANDROID_NDK/platforms/android-$ANDROID_VERSION/arch-x86_64/"
      export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86_64.cmake -DSD_ANDROID_BUILD=true"
+      setwindows_msys
    ;;

    ios-x86_64)
@ -400,9 +448,9 @@ if [ -z "$NAME" ]; then
 fi

 if [ "$LIBTYPE" == "dynamic" ]; then
-     SHARED_LIBS_ARG="-DSD_SHARED_LIB=OFF"
+     SHARED_LIBS_ARG="-DSD_SHARED_LIB=ON -DSD_STATIC_LIB=OFF"
     else
-         SHARED_LIBS_ARG="-DSD_SHARED_LIB=ON"
+         SHARED_LIBS_ARG="-DSD_SHARED_LIB=OFF -DSD_STATIC_LIB=ON"
 fi

 if [ "$BUILD" == "release" ]; then
@ -464,7 +512,9 @@ if [ "$CHIP" == "cuda" ] && [ -n "$CHIP_VERSION" ]; then
    esac
 fi

+
 [[ -z ${OPENBLAS_PATH:-} ]] && OPENBLAS_PATH=""
+OPENBLAS_PATH="${OPENBLAS_PATH//\\//}"

 if [[ -n "${BUILD_PATH:-}" ]]; then
    PREVIFS="$IFS"
@ -537,7 +587,7 @@ echo CHECK_VECTORIZATION = "$CHECK_VECTORIZATION"
 echo HELPERS = "$HELPERS"
 mkbuilddir
 pwd
-eval $CMAKE_COMMAND  "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DSD_CHECK_VECTORIZATION="${CHECK_VECTORIZATION}"  $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
+eval "$CMAKE_COMMAND"  "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DSD_CHECK_VECTORIZATION="${CHECK_VECTORIZATION}"  "$HELPERS" "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..

 if [ "$PARALLEL" == "true" ]; then
    MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ"
@ -551,9 +601,10 @@ if [ "$CHECK_VECTORIZATION" == "ON" ]; then
 if [ "$MAKE_COMMAND" == "make" ]; then
    MAKE_ARGUMENTS="$MAKE_ARGUMENTS --output-sync=target"
 fi
+
 exec 3>&1
-eval $MAKE_COMMAND $MAKE_ARGUMENTS 2>&1 >&3 3>&-  | python3 ../../auto_vectorization/auto_vect.py   && cd ../../..
+eval "$MAKE_COMMAND" "$MAKE_ARGUMENTS" 2>&1 >&3 3>&-  | python3 ../../auto_vectorization/auto_vect.py   && cd ../../..
 exec 3>&- 
 else
-eval $MAKE_COMMAND $MAKE_ARGUMENTS  && cd ../../..
+eval "$MAKE_COMMAND" "$MAKE_ARGUMENTS"  && cd ../../..
 fi
--- a/libnd4j/cmake/android-arm.cmake
+++ b/libnd4j/cmake/android-arm.cmake
@ -1,27 +1,22 @@
 # CMake toolchain to build for Android 5.0 or newer. Sample usage:
 #
-# ANDROID_BIN="/path/to/android-ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/" \
-# ANDROID_CPP="/path/to/android-ndk/sources/cxx-stl/llvm-libc++/" \
-# ANDROID_CC="/path/to/android-ndk/toolchains/llvm/prebuilt/linux-x86_64/bin/clang" \
-# ANDROID_ROOT="/path/to/android-ndk/platforms/android-21/arch-arm/" \
-# cmake -DCMAKE_TOOLCHAIN_FILE=android-arm.cmake -DCMAKE_INSTALL_PREFIX=..
-#
-# If you really need to use libnd4j on a CPU with no FPU, replace "libs/armeabi-v7a" by "libs/armeabi" and
-# "-march=armv7-a -mfloat-abi=softfp -mfpu=vfpv3-d16" with "-march=armv5te -mtune=xscale -msoft-float"
+set(CMAKE_SYSTEM_NAME Android)
+set(CMAKE_ANDROID_ARCH_ABI armeabi-v7a)
+set(CMAKE_ANDROID_NDK "$ENV{ANDROID_NDK}")
+set(CMAKE_ANDROID_STL_TYPE c++_shared)
+set(CMAKE_SYSTEM_VERSION  "$ENV{ANDROID_VERSION}")
+set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION clang)

-set(CMAKE_SYSTEM_NAME UnixPaths)
-set(CMAKE_SYSTEM_PROCESSOR arm)
 set(ANDROID TRUE)
+if (WIN32)
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}.exe")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++.exe")
+   else()
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")
+endif (WIN32)

-set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
-set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")

-set(CMAKE_C_LINK_EXECUTABLE    "<CMAKE_C_COMPILER>   <FLAGS> <CMAKE_C_LINK_FLAGS>   <LINK_FLAGS> <OBJECTS> -target armv7-none-linux-androideabi -Wl,--fix-cortex-a8 -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_LINK_EXECUTABLE  "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -target armv7-none-linux-androideabi -Wl,--fix-cortex-a8 -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/armeabi-v7a/ -nostdlib++ -lc++_static -lc++abi -landroid_support -lm -lc")

-set(CMAKE_C_CREATE_SHARED_LIBRARY    "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -target armv7-none-linux-androideabi -Wl,--fix-cortex-a8 -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_CREATE_SHARED_LIBRARY  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -target armv7-none-linux-androideabi -Wl,--fix-cortex-a8 -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/armeabi-v7a/ -nostdlib++ -lc++_static -lc++abi -landroid_support -lm -lc")
+add_definitions(-D__ANDROID_API__=$ENV{ANDROID_VERSION} -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target  armv7a-linux-androideabi)

-add_definitions(-D__ANDROID_API__=21 -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target armv7-none-linux-androideabi -march=armv7-a -mfloat-abi=softfp -mfpu=vfpv3-d16)
-
-include_directories("$ENV{ANDROID_CPP}/include/" "$ENV{ANDROID_CPP}/../llvm-libc++abi/include/" "$ENV{ANDROID_NDK}/sources/android/support/include/" "$ENV{ANDROID_CPP}/libs/armeabi-v7a/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/arm-linux-androideabi/" "$ENV{ANDROID_ROOT}/usr/include/")
--- a/libnd4j/cmake/android-arm64.cmake
+++ b/libnd4j/cmake/android-arm64.cmake
@ -1,24 +1,22 @@
 # CMake toolchain to build for Android 5.0 or newer. Sample usage:
 #
-# ANDROID_BIN="/path/to/android-ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/" \
-# ANDROID_CPP="/path/to/android-ndk/sources/cxx-stl/llvm-libc++/" \
-# ANDROID_CC="/path/to/android-ndk/toolchains/llvm/prebuilt/linux-x86_64/bin/clang" \
-# ANDROID_ROOT="/path/to/android-ndk/platforms/android-21/arch-arm64/" \
-# cmake -DCMAKE_TOOLCHAIN_FILE=android-arm64.cmake -DCMAKE_INSTALL_PREFIX=..
+set(CMAKE_SYSTEM_NAME Android)
+set(CMAKE_ANDROID_ARCH_ABI arm64-v8a)
+set(CMAKE_ANDROID_NDK "$ENV{ANDROID_NDK}")
+set(CMAKE_ANDROID_STL_TYPE c++_shared)
+set(CMAKE_SYSTEM_VERSION  "$ENV{ANDROID_VERSION}")
+set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION clang)

-set(CMAKE_SYSTEM_NAME UnixPaths)
-set(CMAKE_SYSTEM_PROCESSOR arm64)
 set(ANDROID TRUE)
+if (WIN32)
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}.exe")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++.exe")
+   else()
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")
+endif (WIN32)

-set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
-set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")

-set(CMAKE_C_LINK_EXECUTABLE    "<CMAKE_C_COMPILER>   <FLAGS> <CMAKE_C_LINK_FLAGS>   <LINK_FLAGS> <OBJECTS> -target aarch64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_LINK_EXECUTABLE  "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -target aarch64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/arm64-v8a/ -nostdlib++ -lc++_static -lc++abi -lm -lc")

-set(CMAKE_C_CREATE_SHARED_LIBRARY    "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -target aarch64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_CREATE_SHARED_LIBRARY  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -target aarch64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/arm64-v8a/ -nostdlib++ -lc++_static -lc++abi -lm -lc")
+add_definitions(-D__ANDROID_API__=$ENV{ANDROID_VERSION} -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target aarch64-none-linux-android -march=armv8-a)

-add_definitions(-D__ANDROID_API__=21 -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target aarch64-none-linux-android -march=armv8-a)
-
-include_directories("$ENV{ANDROID_CPP}/include/" "$ENV{ANDROID_CPP}/../llvm-libc++abi/include/" "$ENV{ANDROID_NDK}/sources/android/support/include/" "$ENV{ANDROID_CPP}/libs/arm64-v8a/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/aarch64-linux-android/" "$ENV{ANDROID_ROOT}/usr/include/")
--- a/libnd4j/cmake/android-x86.cmake
+++ b/libnd4j/cmake/android-x86.cmake
@ -1,24 +1,22 @@
 # CMake toolchain to build for Android 5.0 or newer. Sample usage:
 #
-# ANDROID_BIN="/path/to/android-ndk/toolchains/x86-4.9/prebuilt/linux-x86_64/" \
-# ANDROID_CPP="/path/to/android-ndk/sources/cxx-stl/llvm-libc++/" \
-# ANDROID_CC="/path/to/android-ndk/toolchains/llvm/prebuilt/linux-x86_64/bin/clang" \
-# ANDROID_ROOT="/path/to/android-ndk/platforms/android-21/arch-x86/" \
-# cmake -DCMAKE_TOOLCHAIN_FILE=android-x86.cmake -DCMAKE_INSTALL_PREFIX=..
+set(CMAKE_SYSTEM_NAME Android)
+set(CMAKE_ANDROID_ARCH_ABI x86)
+set(CMAKE_ANDROID_NDK "$ENV{ANDROID_NDK}")
+set(CMAKE_ANDROID_STL_TYPE c++_shared)
+set(CMAKE_SYSTEM_VERSION  "$ENV{ANDROID_VERSION}")
+set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION clang)

-set(CMAKE_SYSTEM_NAME UnixPaths)
-set(CMAKE_SYSTEM_PROCESSOR atom)
 set(ANDROID TRUE)
+if (WIN32)
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}.exe")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++.exe")
+   else()
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")
+endif (WIN32)

-set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
-set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")

-set(CMAKE_C_LINK_EXECUTABLE    "<CMAKE_C_COMPILER>   <FLAGS> <CMAKE_C_LINK_FLAGS>   <LINK_FLAGS> <OBJECTS> -target i686-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_LINK_EXECUTABLE  "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -target i686-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/x86/ -nostdlib++ -lc++_static -lc++abi -landroid_support -lm -lc")

-set(CMAKE_C_CREATE_SHARED_LIBRARY    "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -target i686-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_CREATE_SHARED_LIBRARY  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -target i686-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/x86/ -nostdlib++ -lc++_static -lc++abi -landroid_support -lm -lc")
+add_definitions(-D__ANDROID_API__=$ENV{ANDROID_VERSION} -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target i686-linux-android)

-add_definitions(-D__ANDROID_API__=21 -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target i686-none-linux-android -march=i686 -mtune=atom -mssse3 -mfpmath=sse)
-
-include_directories("$ENV{ANDROID_CPP}/include/" "$ENV{ANDROID_CPP}/../llvm-libc++abi/include/" "$ENV{ANDROID_NDK}/sources/android/support/include/" "$ENV{ANDROID_CPP}/libs/x86/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/i686-linux-android/" "$ENV{ANDROID_ROOT}/usr/include/")
--- a/libnd4j/cmake/android-x86_64.cmake
+++ b/libnd4j/cmake/android-x86_64.cmake
@ -1,24 +1,21 @@
 # CMake toolchain to build for Android 5.0 or newer. Sample usage:
-#
-# ANDROID_BIN="/path/to/android-ndk/toolchains/x86_64-4.9/prebuilt/linux-x86_64/" \
-# ANDROID_CPP="/path/to/android-ndk/sources/cxx-stl/llvm-libc++/" \
-# ANDROID_CC="/path/to/android-ndk/toolchains/llvm/prebuilt/linux-x86_64/bin/clang" \
-# ANDROID_ROOT="/path/to/android-ndk/platforms/android-21/arch-x86_64/" \
-# cmake -DCMAKE_TOOLCHAIN_FILE=android-x86_64.cmake -DCMAKE_INSTALL_PREFIX=..

-set(CMAKE_SYSTEM_NAME UnixPaths)
-set(CMAKE_SYSTEM_PROCESSOR atom64)
+set(CMAKE_SYSTEM_NAME Android)
+set(CMAKE_ANDROID_ARCH_ABI x86_64)
+set(CMAKE_ANDROID_NDK "$ENV{ANDROID_NDK}")
+set(CMAKE_ANDROID_STL_TYPE c++_shared)
+set(CMAKE_SYSTEM_VERSION  "$ENV{ANDROID_VERSION}")
+set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION clang)
+
 set(ANDROID TRUE)
+if (WIN32)
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}.exe")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++.exe")
+   else()
+   set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
+   set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")
+endif (WIN32)

-set(CMAKE_C_COMPILER   "$ENV{ANDROID_CC}")
-set(CMAKE_CXX_COMPILER "$ENV{ANDROID_CC}++")

-set(CMAKE_C_LINK_EXECUTABLE    "<CMAKE_C_COMPILER>   <FLAGS> <CMAKE_C_LINK_FLAGS>   <LINK_FLAGS> <OBJECTS> -target x86_64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_LINK_EXECUTABLE  "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -target x86_64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/x86_64/ -nostdlib++ -lc++_static -lc++abi -lm -lc")

-set(CMAKE_C_CREATE_SHARED_LIBRARY    "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -target x86_64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -lm -lc")
-set(CMAKE_CXX_CREATE_SHARED_LIBRARY  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -target x86_64-none-linux-android -Wl,--no-undefined -z text -o <TARGET> <OBJECTS> <LINK_LIBRARIES> -gcc-toolchain $ENV{ANDROID_BIN} --sysroot=$ENV{ANDROID_ROOT} -L$ENV{ANDROID_CPP}/libs/x86_64/ -nostdlib++ -lc++_static -lc++abi -lm -lc")
-
-add_definitions(-D__ANDROID_API__=21 -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target x86_64-none-linux-android -march=x86-64 -mtune=atom)
-
-include_directories("$ENV{ANDROID_CPP}/include/" "$ENV{ANDROID_CPP}/../llvm-libc++abi/include/" "$ENV{ANDROID_NDK}/sources/android/support/include/" "$ENV{ANDROID_CPP}/libs/x86_64/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/" "$ENV{ANDROID_NDK}/sysroot/usr/include/x86_64-linux-android/" "$ENV{ANDROID_ROOT}/usr/include/")
+add_definitions(-D__ANDROID_API__=$ENV{ANDROID_VERSION} -DANDROID -fPIC -ffunction-sections -funwind-tables -fstack-protector-strong -target x86_64-none-linux-android)
--- a/libnd4j/include/array/NDArray.h
+++ b/libnd4j/include/array/NDArray.h
@ -277,13 +277,13 @@ namespace sd {
        /**
 		*  constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently
        */
-		NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext());
+		NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool nullify = true);

        /**
        *  constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to be zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently
        *  set dtype as array type
        */
-        NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext());
+        NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool nullify = true);

        /**
        *  this constructor creates new array using shape information contained in vector argument
--- a/libnd4j/include/array/NDArray.hXX
+++ b/libnd4j/include/array/NDArray.hXX
@ -143,7 +143,7 @@ NDArray::NDArray(void* buffer, const char order, const std::vector<Nd4jLong> &sh

 ////////////////////////////////////////////////////////////////////////
 // creates new NDArray using shape information from "shapeInfo" array, set all elements in new array to be zeros
-NDArray::NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides, sd::LaunchContext * context) {
+NDArray::NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides, sd::LaunchContext * context, const bool nullify) {

    if (shapeInfo == nullptr)
        throw std::runtime_error("NDArray constructor: can't be initalized without shapeinfo");
@ -161,7 +161,9 @@ NDArray::NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyS

    if (!isEmpty()) {
        _buffer = std::make_shared<DataBuffer>(lengthOf() * sizeOfT(), dtype, getContext()->getWorkspace());
-        _buffer->setToZeroBuffers();
+
+        if (nullify)
+            _buffer->setToZeroBuffers();
    }
 }

@ -213,7 +215,7 @@ NDArray::NDArray(sd::LaunchContext * context) {

 ////////////////////////////////////////////////////////////////////////
 // creates new NDArray using shape information from "shapeInfo" array, set all elements in new array to be zeros, set dtype as array type
-NDArray::NDArray(Nd4jLong* shapeInfo, const bool copyStrides, sd::LaunchContext * context):
+NDArray::NDArray(Nd4jLong* shapeInfo, const bool copyStrides, sd::LaunchContext * context, const bool nullify):
        NDArray(shapeInfo, ArrayOptions::dataType(shapeInfo), copyStrides, context) {
 }

@ -3339,9 +3341,6 @@ void NDArray::nullify() {
    if (isEmpty())
        return;

-    if (isS())
-        throw std::runtime_error("NDArray::nullify: can't nullify string array");
-
    if (isView() || ews() != 1)
        assign(0);
    else
--- a/libnd4j/include/execution/LaunchContext.h
+++ b/libnd4j/include/execution/LaunchContext.h
@ -54,6 +54,8 @@ class ND4J_EXPORT LaunchContext {
        static std::vector<std::shared_ptr<LaunchContext>> _contexts;
        static std::mutex _mutex;

+        static MAP_IMPL<int, std::mutex*> _deviceMutexes;
+
        // used for MKLDNN
        void *_engine = nullptr;

@ -93,7 +95,6 @@ class ND4J_EXPORT LaunchContext {
 		void setCudaSpecialStream(cudaStream_t* cudaStream);
 		void setCublasHandle(void *handle);

-
 #endif // JCPP

 #endif // CUDA
@ -111,6 +112,12 @@ class ND4J_EXPORT LaunchContext {
    	void setDeviceID(int deviceID) { _deviceID = deviceID; }
        sd::ErrorReference* errorReference();

+#ifndef __JAVACPP_HACK__
+    	// this method returns mutex shared between all threads that use the same device
+    	static std::mutex* deviceMutex();
+
+#endif
+
    	static bool isInitialized();
    	static void releaseBuffers();

--- a/libnd4j/include/execution/cpu/LaunchContext.cpp
+++ b/libnd4j/include/execution/cpu/LaunchContext.cpp
@ -19,6 +19,7 @@
 //

 #include <execution/LaunchContext.h>
+#include <execution/AffinityManager.h>
 #include <helpers/logger.h>
 #include <exceptions/cuda_exception.h>
 #include <thread>
@ -42,6 +43,8 @@ namespace sd {
    }

    std::vector<std::shared_ptr<LaunchContext>> LaunchContext::_contexts = std::vector<std::shared_ptr<LaunchContext>>();
+    MAP_IMPL<int, std::mutex*> LaunchContext::_deviceMutexes;
+    std::mutex LaunchContext::_mutex;

 ////////////////////////////////////////////////////////////////////////
    LaunchContext::LaunchContext() {
@ -68,6 +71,10 @@ namespace sd {
        return LaunchContext::_contexts[0].get();
    }

+    std::mutex* LaunchContext::deviceMutex() {
+        return &_mutex;
+    }
+
    void LaunchContext::swapContextBuffers(ContextBuffers &buffers) {
        //
    }
--- a/libnd4j/include/execution/cuda/LaunchContext.cu
+++ b/libnd4j/include/execution/cuda/LaunchContext.cu
@ -31,6 +31,7 @@ namespace sd {

    std::vector<std::shared_ptr<LaunchContext>> LaunchContext::_contexts = std::vector<std::shared_ptr<LaunchContext>>();
    std::mutex LaunchContext::_mutex;
+    MAP_IMPL<int, std::mutex*> LaunchContext::_deviceMutexes;

 ////////////////////////////////////////////////////////////////////////
 LaunchContext::LaunchContext(cudaStream_t *cudaStream, cudaStream_t& specialCudaStream, void* reductionPointer, void* scalarPointer, int* allocationPointer)  {
@ -44,6 +45,11 @@ LaunchContext::LaunchContext(cudaStream_t *cudaStream, cudaStream_t& specialCuda
 	_isAllocated = false;
 }

+    std::mutex* LaunchContext::deviceMutex() {
+        auto deviceId = AffinityManager::currentDeviceId();
+        return _deviceMutexes[deviceId];
+    }
+
 LaunchContext::~LaunchContext() {
    if (_isAllocated) {

@ -85,6 +91,8 @@ LaunchContext::LaunchContext() {

            _contexts.resize(numDevices);
            for (int e = 0; e < numDevices; e++) {
+                _deviceMutexes[e] = new std::mutex();
+
                AffinityManager::setCurrentNativeDevice(e);

                LaunchContext::_contexts[e] = std::make_shared<LaunchContext>();
--- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
@ -252,6 +252,8 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
    const bool typeIntFloat  = AB  && aType == DataType::INT8 && cType == DataType::FLOAT32 && major >= 6;
    const bool typeHalfFloat = AB  && aType == DataType::HALF && cType == DataType::FLOAT32  && major >= 6;

+    std::lock_guard<std::mutex> lock(*LaunchContext::deviceMutex());
+
    auto handle = reinterpret_cast<cublasHandle_t *>(A->getContext()->getCublasHandle());
    auto stream = A->getContext()->getCudaStream();

@ -394,6 +396,8 @@ NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, sd::NDArray* Y,
    const bool typeDouble = AXY && aType == DataType::DOUBLE;
    const bool typeFloat  = AXY && aType == DataType::FLOAT32;

+    std::lock_guard<std::mutex> lock(*LaunchContext::deviceMutex());
+
    auto handle = reinterpret_cast<cublasHandle_t *>(A->getContext()->getCublasHandle());
    auto stream = A->getContext()->getCudaStream();

--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@ -4076,7 +4076,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap

    // *** FIRST STAGE - exclude unity dimensions from oldShapeInfo and newShapeInfo (if such are present of course), since they don't affect on strides evaluation, however they complicate code

-    // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities)
+    // FIXME - indeed we don't need to allocate so large memory amount (4*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities)
    Nd4jLong tempBuffer[4*MAX_RANK];
    Nd4jLong *oldShape = tempBuffer, *newShape = tempBuffer + 2*MAX_RANK, *oldStrides,  *newStrides;

--- a/libnd4j/include/ops/declarable/CustomOperations.h
+++ b/libnd4j/include/ops/declarable/CustomOperations.h
@ -45,6 +45,7 @@
 #include <ops/declarable/headers/util.h>
 #include <ops/declarable/headers/BarnesHutTsne.h>
 #include <ops/declarable/headers/images.h>
+#include <ops/declarable/headers/updaters.h>
 #include <system/dll.h>
 #include <helpers/shape.h>
 #include <helpers/TAD.h>
--- a/libnd4j/include/ops/declarable/DeclarableOp.h
+++ b/libnd4j/include/ops/declarable/DeclarableOp.h
@ -106,6 +106,7 @@ namespace sd {
            void storeResult(Context &block, int outputNumber, NDArray& array);
            void storeResult(Context &block, int outputNumber, NDArray* array);
            sd::NDArray* getZ(Context& block, int inputId = 0);
+            sd::NDArray* getNullifiedZ(Context& block, int inputId = 0);

            /**
            *   This method pre-allocates NDArrays for Op output, in case they are not available at op execution time
--- a/libnd4j/include/ops/declarable/PlatformHelper.h
+++ b/libnd4j/include/ops/declarable/PlatformHelper.h
@ -77,7 +77,15 @@ namespace sd {
                 * @param inputId
                 * @return
                 */
-                sd::NDArray *getZ(graph::Context &ctx, int inputId);
+                sd::NDArray* getZ(graph::Context &ctx, int inputId);
+
+                /**
+                 * Helper method, needed for compatibility with DeclarableOp macros
+                 * @param ctx
+                 * @param inputId
+                 * @return
+                 */
+                sd::NDArray* getNullifiedZ(graph::Context &ctx, int inputId);
            };
        }
    }
--- a/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp
+++ b/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp
@ -30,7 +30,7 @@ namespace sd {
        CUSTOM_OP_IMPL(bits_hamming_distance, 2, 1, true, 0, 0) {
            auto x = INPUT_VARIABLE(0);
            auto y = INPUT_VARIABLE(1);
-            auto output = OUTPUT_VARIABLE(0);
+            auto output = OUTPUT_NULLIFIED(0);

            REQUIRE_TRUE(x->lengthOf() == y->lengthOf(), 0, "bits_hamming_distance: both arguments must have the same length");
            REQUIRE_TRUE(x->dataType() == y->dataType(), 0, "bits_hamming_distance: both arguments must have the same data type");
--- a/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp
+++ b/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp
@ -32,7 +32,7 @@ namespace sd {
            auto values = INPUT_VARIABLE(2);
            NDArray *def = nullptr;

-            auto output = OUTPUT_VARIABLE(0);
+            auto output = OUTPUT_NULLIFIED(0);

            if (block.width() > 3)
                def = INPUT_VARIABLE(3);
--- a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
+++ b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
@ -30,7 +30,7 @@ namespace sd {
            auto input = INPUT_VARIABLE(0);
            auto delim = INPUT_VARIABLE(1);

-            auto indices = OUTPUT_VARIABLE(0);
+            auto indices = OUTPUT_NULLIFIED(0);
            auto values = OUTPUT_VARIABLE(1);

            auto d = delim->e<std::string>(0);
--- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/eye.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/eye.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp
@ -30,7 +30,7 @@ namespace sd {
        CUSTOM_OP_IMPL(lstsq, 2, 1, false, 0, 0) {
            auto a = INPUT_VARIABLE(0);
            auto b = INPUT_VARIABLE(1);
-            auto z = OUTPUT_VARIABLE(0);
+            auto z = OUTPUT_NULLIFIED(0);
            bool fastFlag = true;
            double l2_factor = 0.;
            if (block.numB() > 0) {
@ -56,7 +56,7 @@ namespace sd {
        CUSTOM_OP_IMPL(solve_ls, 2, 1, false, 0, 0) {
            auto a = INPUT_VARIABLE(0);
            auto b = INPUT_VARIABLE(1);
-            auto z = OUTPUT_VARIABLE(0);
+            auto z = OUTPUT_NULLIFIED(0);
            bool fastFlag = true;
            double l2_factor = 0.;
            if (block.numB() > 0) {
--- a/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp
@ -114,7 +114,7 @@ namespace sd {

        CUSTOM_OP_IMPL(logdet, 1, 1, false, 0, 0) {
            auto input = INPUT_VARIABLE(0);
-            auto output = OUTPUT_VARIABLE(0);
+            auto output = OUTPUT_NULLIFIED(0);

            REQUIRE_TRUE(input->rankOf() >=2, 0, "logdet: The rank of input array should not less than 2, but %i is given", input->rankOf());
            REQUIRE_TRUE(input->sizeAt(-1) == input->sizeAt(-2), 0, "logdet: The last two dimmensions should be equal, but %i and %i are given", input->sizeAt(-1), input->sizeAt(-2));
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/trace.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/trace.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/tri.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/tri.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/triu.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/triu.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/crelu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/crelu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/cube.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/cube.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/elu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/elu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/hardsigmoid.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/hardsigmoid.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/hardtanh.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/hardtanh.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/identity.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/identity.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/identity_n.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/identity_n.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/lrelu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/lrelu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/prelu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/prelu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/rationaltanh.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/rationaltanh.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/rectifiedtanh.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/rectifiedtanh.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/relu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/relu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/relu6.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/relu6.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/selu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/selu.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/sigmoid.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/sigmoid.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/softplus.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/softplus.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/softsign.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/softsign.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/tanh.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/tanh.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/activations/thresholdedrelu.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/activations/thresholdedrelu.cpp
--- a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp
@ -28,7 +28,7 @@ namespace sd {
    namespace ops {
        CUSTOM_OP_IMPL(col2im, 1, 1, false, 0, 9) {
            auto x = INPUT_VARIABLE(0);
-            auto z = OUTPUT_VARIABLE(0);
+            auto z = OUTPUT_NULLIFIED(0);

            REQUIRE_TRUE(x->rankOf() == 6, 0, "col2im input should be 6D, but got %i instead", x->rankOf());
            REQUIRE_TRUE(z->rankOf() == 4, 0, "col2im output should be 4D, but got %i instead", z->rankOf());
@ -45,8 +45,6 @@ namespace sd {
            LaunchContext* ctx = block.launchContext();
            helpers::col2im(*ctx, *x, *z, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX);

-            STORE_RESULT(*z);
-
            return ND4J_STATUS_OK;
        }
        DECLARE_SHAPE_FN(col2im) {
--- a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp
@ -34,10 +34,10 @@ namespace ops  {
 CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iW, iC] (NWC) or [bS, iC, iW] (NCW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kW, iC, oC], [oC, iC, kW], [oC, kW, iC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]

-    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oW, oC] (NWC) or [bS, oC, oW] (NCW)
+    auto output  = OUTPUT_NULLIFIED(0);                                   // [bS, oW, oC] (NWC) or [bS, oC, oW] (NCW)

    int kW = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) width
    int sW = INT_ARG(1);                                                        // strides width
@ -45,12 +45,13 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) {
    int dW = INT_ARG(3);                                                        // dilations width
    int paddingMode = INT_ARG(4);                                               // 0-VALID, 1-SAME, 2-CAUSAL
    int isNCW       = block.getIArguments()->size() > 5 ? !INT_ARG(5) : 1;      // INT_ARG(4): 0-NCW,  1-NWC
+    int wFormat = block.getIArguments()->size() > 6 ? INT_ARG(6) : 0;           // 0 - [kW, iC, oC], 1 - [oC, iC, kW], 2 - [oC, kW, iC]

    const int rank = 3;
    REQUIRE_TRUE(input->rankOf()   == rank, 0, "CUSTOM CONV1D OP: rank of input array must be equal to %i, but got %i instead !", rank, input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == rank, 0, "CUSTOM CONV1D OP: rank of weights array must be equal to %i, but got %i instead !", rank, weights->rankOf());

-    int indIOioC, indIiW, indWoC(2);
+    int indIOioC, indIiW, indWoC(0 == wFormat ? 2 : 0);
    if(!isNCW) {
        indIOioC = 2; indIiW = 1;
    }
@ -63,7 +64,7 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) {
    int iC = input->sizeAt(indIOioC);                 // input channels
    int oC = weights->sizeAt(indWoC);                 // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = 0 == wFormat ? std::vector<Nd4jLong>({kW, iC, oC}) : (1 == wFormat ? std::vector<Nd4jLong>({oC, iC, kW}) : std::vector<Nd4jLong>({oC, kW, iC}));
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV1D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV1D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
@ -83,11 +84,11 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) {
    auto weightsReshaped = weights->reshape(weights->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)});   // [kW, iC, oC] -> [1, kW, iC, oC]

    sd::ops::conv2d conv2d;
-    const Nd4jStatus status = conv2d.execute({&inputReshaped, &weightsReshaped, bias}, {&outputReshaped}, {}, {1,kW,  1,sW,  0,pW,  1,dW,  paddingMode,  !isNCW}, {});
+    const Nd4jStatus status = conv2d.execute({&inputReshaped, &weightsReshaped, bias}, {&outputReshaped}, {}, {1,kW,  1,sW,  0,pW,  1,dW,  paddingMode, !isNCW, wFormat}, {});
    if (status != ND4J_STATUS_OK)
        return status;

-    // ConvolutionUtils::conv2d(block, &inputReshaped, &weightsReshaped, bias, &outputReshaped, 1,kW,  1,sW,  0,pW,  1,dW,  paddingMode,  isNCW);
+    // ConvolutionUtils::conv2d(block, &inputReshaped, &weightsReshaped, bias, &outputReshaped, 1,kW,  1,sW,  0,pW,  1,dW,  paddingMode, isNCW, wFormat);

    return Status::OK();
 }
@ -105,8 +106,9 @@ DECLARE_SHAPE_FN(conv1d) {
    int dW = INT_ARG(3);                                                        // dilations width
    int paddingMode = INT_ARG(4);                                               // 0-VALID, 1-SAME
    int isNCW  = block.getIArguments()->size() > 5 ? !INT_ARG(5) : 1;           // INT_ARG(4): 1-NWC, 0-NCW
+    int wFormat = block.getIArguments()->size() > 6 ? INT_ARG(6) : 0;           // 0 - [kW, iC, oC], 1 - [oC, iC, kW], 2 - [oC, kW, iC]

-    int indIOioC, indIiW, indWoC(2);
+    int indIOioC, indIiW, indWoC(0 == wFormat ? 2 : 0);
    if(!isNCW) {
        indIOioC = 2; indIiW = 1;
    }
@ -123,7 +125,7 @@ DECLARE_SHAPE_FN(conv1d) {
    int iC = inputShapeInfo[indIOioC+1];                   // input channels
    int oC = weightsShapeInfo[indWoC+1];                 // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = 0 == wFormat ? std::vector<Nd4jLong>({kW, iC, oC}) : (1 == wFormat ? std::vector<Nd4jLong>({oC, iC, kW}) : std::vector<Nd4jLong>({oC, kW, iC}));
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV1D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(biasShapeInfo[0] <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM CONV1D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
@ -163,13 +165,13 @@ DECLARE_TYPES(conv1d) {
 CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iW, iC] (NWC) or [bS, iC, iW] (NCW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kW, iC, oC], [oC, iC, kW], [oC, kW, iC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oW, oC] (NWC) or [bS, oC, oW] (NCW), epsilon_next

-    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iW, iC] (NWC) or [bS, iC, iW] (NCW), epsilon
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kW, iC, oC] always
-    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+    auto gradI = OUTPUT_NULLIFIED(0);                                                 // [bS, iW, iC] (NWC) or [bS, iC, iW] (NCW), epsilon
+    auto gradW = OUTPUT_NULLIFIED(1);                                                 // [kW, iC, oC], [oC, iC, kW], [oC, kW, iC]
+    auto gradB = block.width() > 3 ? OUTPUT_NULLIFIED(2) : nullptr;                   // [oC]

    int kW = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) width
    int sW = INT_ARG(1);                                                        // strides width
@ -177,12 +179,14 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) {
    int dW = INT_ARG(3);                                                        // dilations width
    int paddingMode = INT_ARG(4);                                               // 0-VALID, 1-SAME, 2-CAUSAL
    int isNCW  = block.getIArguments()->size() > 5 ? !INT_ARG(5) : 1;           // INT_ARG(4): 1-NWC, 0-NCW
+    int wFormat = block.getIArguments()->size() > 6 ? INT_ARG(6) : 0;           // 0 - [kW, iC, oC], 1 - [oC, iC, kW], 2 - [oC, kW, iC]

    const int rank = 3;
    REQUIRE_TRUE(input->rankOf()   == rank, 0, "CUSTOM CONV1D_BP OP: rank of input array must be equal to %i, but got %i instead !", rank, input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == rank, 0, "CUSTOM CONV1D_BP OP: rank of weights array must be equal to %i, but got %i instead !", rank, weights->rankOf());
    REQUIRE_TRUE(gradO->rankOf()   == rank, 0, "CUSTOM CONV1D_BP OP: rank of output gradients (next epsilon) array must be equal to %i, but got %i instead !", rank, gradO->rankOf());
-    int indIOioC, indIiW, indWoC(2);
+
+    int indIOioC, indIiW, indWoC(0 == wFormat ? 2 : 0);
    if(!isNCW) {
        indIOioC = 2; indIiW = 1;
    }
@ -199,7 +203,7 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) {
    ConvolutionUtils::calcOutSizePool2D(trueoH,trueoW, 1,kW, 1,sW, 0,pW, 1,dW, 1,iW, paddingMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoW,  0,indIOioC,indIiW});
-    std::vector<Nd4jLong> expectedWeightsShape = {kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = 0 == wFormat ? std::vector<Nd4jLong>({kW, iC, oC}) : (1 == wFormat ? std::vector<Nd4jLong>({oC, iC, kW}) : std::vector<Nd4jLong>({oC, kW, iC}));
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM CONV1D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV1D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
@ -222,11 +226,11 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) {
    auto gradWReshaped   = gradW  ->reshape(gradW->ordering(),  {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}, false);// [kW, iC, oC] -> [1, kW, iC, oC]

    sd::ops::conv2d_bp conv2dBP;
-    auto status = conv2dBP.execute({&inputReshaped, &weightsReshaped, bias, &gradOReshaped}, {&gradIReshaped, &gradWReshaped, gradB}, {}, {1,kW,  1,sW,  0,pW,  1,dW,  paddingMode,  !isNCW}, {});
+    auto status = conv2dBP.execute({&inputReshaped, &weightsReshaped, bias, &gradOReshaped}, {&gradIReshaped, &gradWReshaped, gradB}, {}, {1,kW,  1,sW,  0,pW,  1,dW,  paddingMode, !isNCW, wFormat}, {});
    if (status != ND4J_STATUS_OK)
        return status;

-    // ConvolutionUtils::conv2dBP(block, &inputReshaped, &weightsReshaped, bias, &gradOReshaped, &gradIReshaped, &gradWReshaped, gradB, 1,kW,  1,sW,  0,pW,  1,dW,  paddingMode,  isNCW);
+    // ConvolutionUtils::conv2dBP(block, &inputReshaped, &weightsReshaped, bias, &gradOReshaped, &gradIReshaped, &gradWReshaped, gradB, 1,kW,  1,sW,  0,pW,  1,dW,  paddingMode, isNCW, wFormat);

    return Status::OK();
 }
@ -235,7 +239,7 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) {
 DECLARE_SHAPE_FN(conv1d_bp) {

    auto inputShapeInfo   = inputShape->at(0);                                               // [bS, iW, iC] (NWC) or [bS, iC, iW] (NCW)
-    auto weightsShapeInfo = inputShape->at(1);                                               // [kW, iC, oC] always
+    auto weightsShapeInfo = inputShape->at(1);                                               // [kW, iC, oC], [oC, iC, kW], [oC, kW, iC]
    Nd4jLong* biasShapeInfo    = block.width() > 3 ? inputShape->at(2) : nullptr;            // [oC]
    Nd4jLong* gradOShapeInfo   = block.width() > 3 ? inputShape->at(3) : inputShape->at(2);  // [bS, oW, oC] (NWC) or [bS, oC, oW] (NCW), epsilon_next

@ -250,8 +254,9 @@ DECLARE_SHAPE_FN(conv1d_bp) {
    int dW = INT_ARG(3);                                                        // dilations width
    int paddingMode = INT_ARG(4);                                               // 0-VALID, 1-SAME
    int isNCW  = block.getIArguments()->size() > 5 ? !INT_ARG(5) : 1;           // INT_ARG(4): 1-NWC, 0-NCW
+    int wFormat = block.getIArguments()->size() > 6 ? INT_ARG(6) : 0;           // 0 - [kW, iC, oC], 1 - [oC, iC, kW], 2 - [oC, kW, iC]

-    int indIOioC, indIiW, indWoC(2);
+    int indIOioC, indIiW, indWoC(0 == wFormat ? 2 : 0);
    if(!isNCW) {
        indIOioC = 2; indIiW = 1;
    }
@ -268,7 +273,7 @@ DECLARE_SHAPE_FN(conv1d_bp) {
    ConvolutionUtils::calcOutSizePool2D(trueoH,trueoW, 1,kW, 1,sW, 0,pW, 1,dW, 1,iW, paddingMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoW,  0,indIOioC,indIiW});
-    std::vector<Nd4jLong> expectedWeightsShape = {kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = 0 == wFormat ? std::vector<Nd4jLong>({kW, iC, oC}) : (1 == wFormat ? std::vector<Nd4jLong>({oC, iC, kW}) : std::vector<Nd4jLong>({oC, kW, iC}));
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(gradOShapeInfo, expectedGradOShape), 0,  "CUSTOM CONV1D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV1D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
--- a/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp
@ -37,10 +37,10 @@ namespace ops  {
 CUSTOM_OP_IMPL(conv2d, 2, 1, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]

-    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+    auto output  = OUTPUT_NULLIFIED(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)

    int sH = INT_ARG(2);                                                        // strides height
    int sW = INT_ARG(3);                                                        // strides width
@ -49,21 +49,22 @@ CUSTOM_OP_IMPL(conv2d, 2, 1, false, 0, 9) {
    int dH = INT_ARG(6);                                                        // dilations height
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
-    bool isNCHW    = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int isNCHW  = block.getIArguments()->size() > 9  ? !INT_ARG(9) : 1;         // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());

-    ConvolutionUtils::conv2d(block, input, weights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW);
+    ConvolutionUtils::conv2d(block, input, weights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW,wFormat);

    return Status::OK();
 }
@ -73,7 +74,7 @@ CUSTOM_OP_IMPL(conv2d, 2, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(conv2d) {

    auto inputShapeInfo   = inputShape->at(0);                                  // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weightsShapeInfo = inputShape->at(1);                                  // [kH, kW, iC, oC] always
+    auto weightsShapeInfo = inputShape->at(1);                                  // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto biasShapeInfo    = block.width() > 2 ? inputShape->at(2) : nullptr;    // [oC]

    //output [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
@ -86,6 +87,7 @@ DECLARE_SHAPE_FN(conv2d) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(shape::sizeAt(weightsShapeInfo, 0)); // filter(kernel) height
    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(shape::sizeAt(weightsShapeInfo, 1)); // filter(kernel) width
@ -95,7 +97,7 @@ DECLARE_SHAPE_FN(conv2d) {
    REQUIRE_TRUE(inputShapeInfo[0]   == rank, 0, "CUSTOM CONV2D OP: rank of input array must be equal to %i, but got %i instead !", rank, inputShapeInfo[0]);
    REQUIRE_TRUE(weightsShapeInfo[0] == rank, 0, "CUSTOM CONV2D OP: rank of weights array must be equal to %i, but got %i instead !", rank, weightsShapeInfo[0]);

-    int indIOioC, indIiH, indWoC(3);
+    int indIOioC, indIiH, indWoC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -109,7 +111,7 @@ DECLARE_SHAPE_FN(conv2d) {
    const int iC = inputShapeInfo[indIOioC+1];                   // input channels
    const int oC = weightsShapeInfo[indWoC+1];                   // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(biasShapeInfo[0] <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM CONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
@ -157,13 +159,13 @@ DECLARE_SHAPE_FN(conv2d) {
 CUSTOM_OP_IMPL(conv2d_bp, 3, 2, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next

-    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, iC, oC] always
-    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+    auto gradI = OUTPUT_NULLIFIED(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradW = OUTPUT_NULLIFIED(1);                                                 // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+    auto gradB = block.width() > 3 ? OUTPUT_NULLIFIED(2) : nullptr;                   // [oC]

    int kH = INT_ARG(0);                                                        // filter(kernel) height
    int kW = INT_ARG(1);                                                        // filter(kernel) width
@ -175,6 +177,7 @@ CUSTOM_OP_IMPL(conv2d_bp, 3, 2, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM CONV2D_BP OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D_BP OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
@ -182,19 +185,19 @@ CUSTOM_OP_IMPL(conv2d_bp, 3, 2, false, 0, 9) {

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    int trueoH, trueoW;          // true output height, width
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong>expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong>expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong>expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM CONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D_BP OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());

-    ConvolutionUtils::conv2dBP(block, input, weights, bias, gradO, gradI, gradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW);
+    ConvolutionUtils::conv2dBP(block, input, weights, bias, gradO, gradI, gradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW,wFormat);

    return Status::OK();
 }
@ -204,7 +207,7 @@ CUSTOM_OP_IMPL(conv2d_bp, 3, 2, false, 0, 9) {
 DECLARE_SHAPE_FN(conv2d_bp) {

    auto inputShapeInfo   = inputShape->at(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, iC, oC] always
+    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto biasShapeInfo    = block.width() > 3 ? inputShape->at(2) : nullptr;                  // [oC]
    auto gradOShapeInfo   = block.width() > 3 ? inputShape->at(3) : inputShape->at(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next

@ -224,8 +227,9 @@ DECLARE_SHAPE_FN(conv2d_bp) {
    const int dW = INT_ARG(7);                                                        // dilations width
    const int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    const int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+    const int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

-    int indIOioC, indIiH, indOoH, indWoC(3);
+    int indIOioC, indIiH, indOoH, indWoC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1; indOoH = 1;
    }
@ -243,7 +247,7 @@ DECLARE_SHAPE_FN(conv2d_bp) {
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(gradOShapeInfo, expectedGradOShape), 0,  "CUSTOM CONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
@ -264,10 +268,10 @@ DECLARE_SHAPE_FN(conv2d_bp) {
 CUSTOM_OP_IMPL(conv2d_input_bp, 3, 1, false, 0, 9) {

    auto gradIShape = INPUT_VARIABLE(0);                                                // [4]
-    auto weights    = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
+    auto weights    = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto gradO      = INPUT_VARIABLE(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next

-    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradI = OUTPUT_NULLIFIED(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon

    int kH = INT_ARG(0);                                                        // filter(kernel) height
    int kW = INT_ARG(1);                                                        // filter(kernel) width
@ -279,6 +283,7 @@ CUSTOM_OP_IMPL(conv2d_input_bp, 3, 1, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

    const int rank = gradO->rankOf();

@ -295,17 +300,17 @@ CUSTOM_OP_IMPL(conv2d_input_bp, 3, 1, false, 0, 9) {

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    int trueoH, trueoW;          // true output height, width
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM CONV2D_INPUT_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D_INPUT_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());

-    ConvolutionUtils::conv2dBP(block, &input, weights, nullptr, gradO, gradI, nullptr, nullptr, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW);
+    ConvolutionUtils::conv2dBP(block, &input, weights, nullptr, gradO, gradI, nullptr, nullptr, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW,wFormat);

    return Status::OK();
 }
@ -321,7 +326,7 @@ CUSTOM_OP_IMPL(conv2d_input_bp, 3, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(conv2d_input_bp) {

    auto gradIShapeShapeInfo = inputShape->at(0);                                                // [4]
-    auto weightsShapeInfo    = inputShape->at(1);                                                // [kH, kW, iC, oC] always
+    auto weightsShapeInfo    = inputShape->at(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto gradOShapeInfo      = inputShape->at(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next

    const int rank = 4;
@ -340,8 +345,9 @@ DECLARE_SHAPE_FN(conv2d_input_bp) {
    const int dW = INT_ARG(7);                                                        // dilations width
    const int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    const int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 0-NCHW, 1-NHWC
+    const int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

-    int indIOioC, indIiH, indWoC(3), indOoH;
+    int indIOioC, indIiH, indWoC(0 == wFormat ? 3 : 0), indOoH;
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1; indOoH = 1;
    }
@ -361,7 +367,7 @@ DECLARE_SHAPE_FN(conv2d_input_bp) {
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(gradOShapeInfo, expectedGradOShape), 0,  "CUSTOM CONV2D_INPUT_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV2D_INPUT_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());

--- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp
@ -32,7 +32,7 @@ namespace ops  {

 CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) {
    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, iC, oC], [oC, iC, kD, kH, kW], [oC, kD, kH, kW, iC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]
    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW)

@ -52,14 +52,15 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) {
    int dH = INT_ARG(10);                                                       // dilations height
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                              // 0-SAME,  1-VALID
-    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;        // 0-[kD, kH, kW, iC, oC], 1-[oC, iC, kD, kH, kW], 2-[oC, kD, kH, kW, iC]

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, wFormat, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);

    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, iC, oC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV3D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV3D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
@ -71,14 +72,24 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) {
    std::vector<int> permutForOutput;

    if (isNCDHW)
-        permutForOutput    = {0,2,3,4,1};                                        // [bS, oC, oD, oH, oW] -> [bS, oD, oH, oW, oC]
+        permutForOutput = {0,2,3,4,1};                                        // [bS, oC, oD, oH, oW] -> [bS, oD, oH, oW, oC]
    else
        input = new NDArray(input->permute({0,4,1,2,3}));

+    std::vector<int> wAxes;
+    if(0 == wFormat)
+        wAxes = {3,0,1,2};
+    else if(1 == wFormat)
+        wAxes = {1,2,3,4};
+    else
+        wAxes = {4,1,2,3};
+
    NDArray columns(input->ordering(), {bS, iC, kD, kH, kW, oD, oH, oW}, input->dataType(), block.launchContext());
    ConvolutionUtils::vol2col(block, *input, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW);                 // [bS, iC, iD, iH, iW] is convoluted to [bS, iC, kD, kH, kW, oD, oH, oW]
    // [bS, iC, kD, kH, kW, oD, oH, oW] x [kD, kH, kW, iC, oC] = [bS, oD, oH, oW, oC]
-    MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, {3,0,1,2}, permutForOutput);
+    // [bS, iC, kD, kH, kW, oD, oH, oW] x [oC, iC, kD, kH, kW] = [bS, oD, oH, oW, oC]
+    // [bS, iC, kD, kH, kW, oD, oH, oW] x [oC, kD, kH, kW, iC] = [bS, oD, oH, oW, oC]
+    MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, wAxes, permutForOutput);

    if(bias)
        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
@ -101,7 +112,7 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) {
 DECLARE_SHAPE_FN(conv3dnew) {

    auto inputShapeInfo   = inputShape->at(0);                                  // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weightsShapeInfo = inputShape->at(1);                                  // [kD, kH, kW, iC, oC] always
+    auto weightsShapeInfo = inputShape->at(1);                                  // [kD, kH, kW, iC, oC], [oC, iC, kD, kH, kW], [oC, kD, kH, kW, iC]
    auto biasShapeInfo    = block.width() > 2 ? inputShape->at(2) : nullptr;    // [oC]

    int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(shape::sizeAt(weightsShapeInfo, 0));// filter(kernel) depth
@ -118,13 +129,14 @@ DECLARE_SHAPE_FN(conv3dnew) {
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                              // 1-SAME,  0-VALID;
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0-[kD, kH, kW, iC, oC], 1-[oC, iC, kD, kH, kW], 2-[oC, kD, kH, kW, iC]

    const int rank = 5;
    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
    REQUIRE_TRUE(inputShapeInfo[0]   == rank, 0, "CUSTOM CONV3D OP: rank of input array must be equal to %i, but got %i instead !", rank, inputShapeInfo);
    REQUIRE_TRUE(weightsShapeInfo[0] == rank, 0, "CUSTOM CONV3D OP: rank of weights array must be equal to %i, but got %i instead !", rank, weightsShapeInfo);

-    int indIOioC, indIiD, indWoC(4);
+    int indIOioC, indIiD, indWoC(0 == wFormat ? 4 : 0);
    if(!isNCDHW) {
        indIOioC = 4; indIiD = 1;
    }
@ -139,7 +151,7 @@ DECLARE_SHAPE_FN(conv3dnew) {
    int iC = inputShapeInfo[indIOioC+1];                  // input channels
    int oC = weightsShapeInfo[indWoC+1];                  // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV3D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(biasShapeInfo[0] <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM CONV3D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
@ -174,12 +186,12 @@ DECLARE_SHAPE_FN(conv3dnew) {
 CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, iC, oC], [oC, iC, kD, kH, kW], [oC, kD, kH, kW, iC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next

    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kD, kH, kW, iC, oC] always
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kD, kH, kW, iC, oC], [oC, iC, kD, kH, kW], [oC, kD, kH, kW, iC]
    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]

    REQUIRE_TRUE(input->rankOf()   == 5, 0, "CUSTOM CONV3D_BP OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
@ -200,17 +212,18 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                              // 1-SAME,  0-VALID
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0-[kD, kH, kW, iC, oC], 1-[oC, iC, kD, kH, kW], 2-[oC, kD, kH, kW, iC]

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, wFormat, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);

    int trueoD, trueoH, trueoW;          // true output depth/height/width
    ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);

    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D_BP OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, iC, oC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV3D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
@ -231,10 +244,25 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
        gradOaxesForDot  = {0,2,3,4};                                           // bS, oD, oH, oW
    }

+    std::vector<int> wPermut, colPermut;
+
+    if(0 == wFormat) {
+        wPermut   = {3,0,1,2,4};
+        colPermut = {2,3,4,1,0,5,6,7};
+    }
+    else if(1 == wFormat) {
+        wPermut   = {1,2,3,4,0};
+        colPermut = {1,2,3,4,0,5,6,7};
+    }
+    else {
+        wPermut   = {4,1,2,3,0};
+        colPermut = {2,3,4,1,0,5,6,7};
+    }
+
    // ----- calculation of gradW and gradB ----- //
    NDArray columns(input->ordering(), {bS, iC, kD, kH, kW, oD, oH, oW}, input->dataType(), block.launchContext());
    ConvolutionUtils::vol2col(block, *input, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW);                   // [bS, iC, iD, iH, iW] is convoluted to [bS, iC, kD, kH, kW, oD, oH, oW]
-    MmulHelper::tensorDot(&columns, gradO, gradW, {0,5,6,7}, gradOaxesForDot, {3,0,1,2,4});     // [bS, iC, kD, kH, kW, oD, oH, oW] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [iC, kD, kH, kW, oC]
+    MmulHelper::tensorDot(&columns, gradO, gradW, {0,5,6,7}, gradOaxesForDot, wPermut);     // [bS, iC, kD, kH, kW, oD, oH, oW] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [iC, kD, kH, kW, oC]

    //----- calculation of gradO -----//
    if(gradB) {
@ -246,7 +274,10 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
    }

    //----- calculation of gradI -----//
-    MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2,3,4,1,0,5,6,7});   // [kD, kH, kW, iC, oC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
+    // [kD, kH, kW, iC, oC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
+    // [oC, iC, kD, kH, kW] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
+    // [oC, kD, kH, kW, iC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
+    MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, colPermut);
    ConvolutionUtils::col2vol(block, columns, *gradI, sD, sH, sW, pD, pH, pW, dD, dH, dW);                   // columns [bS, iC, kD, kH, kW, oD, oH, oW] is de-convoluted to  [bS, iC, iD, iH, iW]

    if(!isNCDHW) {
@ -270,7 +301,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
 DECLARE_SHAPE_FN(conv3dnew_bp) {

    Nd4jLong* inputShapeInfo   = inputShape->at(0);                                              // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    Nd4jLong* weightsShapeInfo = inputShape->at(1);                                              // [kD, kH, kW, iC, oC] always
+    Nd4jLong* weightsShapeInfo = inputShape->at(1);                                              // [kD, kH, kW, iC, oC], [oC, iC, kD, kH, kW], [oC, kD, kH, kW, iC]
    Nd4jLong* biasShapeInfo    = block.width() > 3 ? inputShape->at(2) : nullptr;                // [oC]
    Nd4jLong* gradOShapeInfo   = block.width() > 3 ? inputShape->at(3) : inputShape->at(2);      // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next

@ -288,6 +319,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
    int dW = INT_ARG(11);                                                       // dilations width
    int paddingMode = INT_ARG(12);                                               // 1-SAME,  0-VALID
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0-[kD, kH, kW, iC, oC], 1-[oC, iC, kD, kH, kW], 2-[oC, kD, kH, kW, iC]

    const int rank = 5;
    REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
@ -295,7 +327,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
    REQUIRE_TRUE(weightsShapeInfo[0] == rank, 0, "CUSTOM CONV3D_BP OP: rank of weights array must be equal to %i, but got %i instead !", rank, weightsShapeInfo);
    REQUIRE_TRUE(gradOShapeInfo[0]   == rank, 0, "CUSTOM CONV3D_BP OP: rank of output gradients (next epsilon) array must be equal to %i, but got %i instead !", rank, gradOShapeInfo);

-    int indIOioC, indIiD, indWoC(4);
+    int indIOioC, indIiD, indWoC(0 == wFormat ? 4 : 0);
    if(!isNCDHW) {
        indIOioC = 4; indIiD = 1;
    }
@ -314,7 +346,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
    ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIiD,indIiD+1,indIiD+2});
-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(gradOShapeInfo, expectedGradOShape),   0, "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "CUSTOM CONV3D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
--- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp
@ -35,10 +35,10 @@ namespace ops  {
 CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, oC, iC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, oC, iC], [iC, oC, kH, kW], [iC, kH, kW, oC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]

-    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+    auto output  = OUTPUT_NULLIFIED(0);                                   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)

    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM DECONV2D OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM DECONV2D OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
@ -53,12 +53,13 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW     = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, oC, iC], 1 - [iC, oC, kH, kW], 2 - [iC, kH, kW, oC]

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH);

-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, oC, iC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
@ -66,6 +67,12 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {
    if(!isNCHW)
        output = new NDArray(output->permute({0, 3, 1, 2}));       // [bS, oH, oW, oC] -> [bS, oC, oH, oW]

+    std::vector<int> colPermut;
+    if(1 == wFormat)
+        colPermut = {1, 2, 3, 0, 4, 5};
+    else
+        colPermut = {2, 3, 1, 0, 4, 5};
+
    if(isSameMode)          // Note: we're intentionally swapping iH and oH, to calculated the padding for a"normal" conv (not deconv) forward pass
        ConvolutionUtils::calcPadding2D(pH, pW, iH, iW, oH, oW, kH, kW, sH, sW, dH, dW);

@ -73,8 +80,9 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {

    //----- calculation of output -----//
    // NHWC: [kH, kW, oC, iC] x [bS, iH, iW, iC] = [kH, kW, oC, bS, iH, iW]
-    // NCHW: [kH, kW, oC, iC] x [bS, iC, iH, iW] = [kH, kW, oC, bS, iH, iW]
-    sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 1, 0, 4, 5});
+    // NHWC: [iC, oC, kH, kW] x [bS, iH, iW, iC] = [oC, kH, kW, bS, iH, iW]
+    // NHWC: [iC, kH, kW, oC] x [bS, iH, iW, iC] = [kH, kW, oC, bS, iH, iW]
+    sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, colPermut);
    LaunchContext* ctx = block.launchContext();
    helpers::col2im(*ctx, columns, *output, sH, sW, pH, pW, oH, oW, dH, dW);     // [bS, oC, kH, kW, iH, iW] is de-convoluted to [bS, oC, oH, oW]

@ -97,7 +105,7 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(deconv2d) {

    auto inputShapeInfo   = inputShape->at(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weightsShapeInfo = inputShape->at(1);                                    // [kH, kW, oC, iC] always
+    auto weightsShapeInfo = inputShape->at(1);                                    // [kH, kW, oC, iC], [iC, oC, kH, kW], [iC, kH, kW, oC]
    auto biasShapeInfo    = block.width() > 2 ? inputShape->at(2) : nullptr;      // [oC]

    const int rank = 4;
@ -114,8 +122,9 @@ DECLARE_SHAPE_FN(deconv2d) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, oC, iC], 1 - [iC, oC, kH, kW], 2 - [iC, kH, kW, oC]

-    int indIOioC, indIiH, indWoC(2);
+    int indIOioC, indIiH, indWoC(0 == wFormat ? 2 : (1 == wFormat ? 1 : 3));
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -129,7 +138,7 @@ DECLARE_SHAPE_FN(deconv2d) {
    const int iC = inputShapeInfo[indIOioC+1];                   // input channels
    const int oC = weightsShapeInfo[indWoC+1];                   // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, oC, iC);
    REQUIRE_TRUE(shape::shapeEquals(4, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(shape::rank(biasShapeInfo) <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM DECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
@ -163,12 +172,12 @@ DECLARE_SHAPE_FN(deconv2d) {
 CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, oC, iC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, oC, iC], [iC, oC, kH, kW], [iC, kH, kW, oC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next

    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW), gradI
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, oC, iC] always
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, oC, iC], [iC, oC, kH, kW], [iC, kH, kW, oC]
    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]

    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM DECONV2D_BP OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
@ -186,16 +195,17 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, oC, iC], 1 - [iC, oC, kH, kW], 2 - [iC, kH, kW, oC]

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH);

    int trueoH, trueoW;          // true output height, width
    ConvolutionUtils::calcOutSizeDeconv2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, oC, iC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM DECONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DECONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
@ -206,29 +216,34 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) {
        ConvolutionUtils::calcPadding2D(pH, pW, iH, iW, oH, oW, kH, kW, sH, sW, dH, dW);
    }

-
-     // ----- calculation of gradI -> pass it through conv2d_ff ----- //
+    // ----- calculation of gradI -> pass it through conv2d_ff ----- //
    sd::ops::conv2d conv2d;
-    const Nd4jStatus status = conv2d.execute({gradO, weights}, {gradI}, {}, {kH,kW,  sH,sW,  pH,pW,  dH,dW,  isSameMode,  !isNCHW}, {});
+    const Nd4jStatus status = conv2d.execute({gradO, weights}, {gradI}, {}, {kH,kW,  sH,sW,  pH,pW,  dH,dW,  isSameMode, !isNCHW, wFormat}, {});
    if (status != ND4J_STATUS_OK)
        return status;

    // -----prepare permutation arrays and axes for dot product ----- //
-    std::vector<int> inputAxesForDot;
+    std::vector<int> inputAxes;

    if(!isNCHW) {
        gradO = new NDArray(gradO->permute({0, 3, 1, 2}));                      // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
-        inputAxesForDot = {0, 1, 2};                                            // bS, iH, iW
+        inputAxes = {0, 1, 2};                                            // bS, iH, iW
    }
    else
-        inputAxesForDot = {0, 2, 3};                                            // bS, iH, iW
+        inputAxes = {0, 2, 3};                                            // bS, iH, iW
+
+    std::vector<int> gradWAxes;     // empty for wFormat = 1
+    if(0 == wFormat)
+        gradWAxes = {3, 2, 0, 1};
+    else if(2 == wFormat)
+        gradWAxes = {0, 3, 1, 2};

    // ----- calculation of gradW ----- //
    NDArray columns(input->ordering(), {bS, oC, kH, kW, iH, iW}, input->dataType(), block.launchContext());

    LaunchContext* ctx = block.launchContext();
    helpers::im2col(*ctx, *gradO, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, oC, oH, oW] is convoluted to [bS, oC, kH, kW, iH, iW]
-    MmulHelper::tensorDot(input, &columns, gradW, inputAxesForDot, {0, 4, 5}, {3, 2, 0, 1});     // [bS, iC, iH, iW]/[bS, iH, iW, iC] x [bS, oC, kH, kW, iH, iW] = [iC, oC, kH, kW]
+    MmulHelper::tensorDot(input, &columns, gradW, inputAxes, {0, 4, 5}, gradWAxes);     // [bS, iC, iH, iW]/[bS, iH, iW, iC] x [bS, oC, kH, kW, iH, iW] = [iC, oC, kH, kW]

    // ----- calculation of gradB ----- //
    if(gradB) {
@ -248,7 +263,7 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) {
 DECLARE_SHAPE_FN(deconv2d_bp) {

    auto inputShapeInfo   = inputShape->at(0);                                                // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
-    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, oC, iC] always
+    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, oC, iC], [iC, oC, kH, kW], [iC, kH, kW, oC]
    Nd4jLong* biasShapeInfo    = block.width() > 3 ? inputShape->at(2) : nullptr;             // [oC]
    Nd4jLong* gradOShapeInfo   = block.width() > 3 ? inputShape->at(3) : inputShape->at(2);   // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next

@ -267,8 +282,9 @@ DECLARE_SHAPE_FN(deconv2d_bp) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, oC, iC], 1 - [iC, oC, kH, kW], 2 - [iC, kH, kW, oC]

-    int indIOioC, indIiH, indWoC(2), indOoH;
+    int indIOioC, indIiH, indOoH, indWoC(0 == wFormat ? 2 : (1 == wFormat ? 1 : 3));
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1; indOoH = 1;
    }
@ -286,7 +302,7 @@ DECLARE_SHAPE_FN(deconv2d_bp) {
    ConvolutionUtils::calcOutSizeDeconv2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, oC, iC);
    REQUIRE_TRUE(shape::shapeEquals(4, expectedGradOShape.data(), shape::rank(gradOShapeInfo), shape::shapeOf(gradOShapeInfo)), 0,  "CUSTOM DECONV2D_BP OP: wrong shape of output gradients next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(shape::shapeEquals(4, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DECONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
--- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp
@ -32,10 +32,10 @@ namespace ops  {
 CUSTOM_OP_IMPL(deconv2d_tf, 3, 1, false, 0, 9) {

    auto gradO      = INPUT_VARIABLE(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    auto weights    = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC] always
+    auto weights    = INPUT_VARIABLE(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto gradIShape = INPUT_VARIABLE(0);                                                // [4] - shape of input of conv2d (that is shape of gradI)

-    auto gradI = OUTPUT_VARIABLE(0);                                                  // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradI = OUTPUT_NULLIFIED(0);                                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon

    int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) height
    int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) width
@ -47,6 +47,7 @@ CUSTOM_OP_IMPL(deconv2d_tf, 3, 1, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

    const int rank = gradO->rankOf();

@ -57,20 +58,19 @@ CUSTOM_OP_IMPL(deconv2d_tf, 3, 1, false, 0, 9) {
    // create empty conv2d input array
    NDArray input(gradO->ordering(), gradIShape->asVectorT<Nd4jLong>(), gradO->dataType(), block.launchContext());

-
    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    int trueoH, trueoW;          // true output height, width
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM DECONV2D_TF OP: wrong shape of input array, basing on array with output shape expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DECONV2D_TF OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());

-    ConvolutionUtils::conv2dBP(block, &input, weights, nullptr, gradO, gradI, nullptr, nullptr, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW);
+    ConvolutionUtils::conv2dBP(block, &input, weights, nullptr, gradO, gradI, nullptr, nullptr, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW,wFormat);

    return Status::OK();
 }
@ -84,7 +84,7 @@ CUSTOM_OP_IMPL(deconv2d_tf, 3, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(deconv2d_tf) {

    auto gradOShapeInfo   = inputShape->at(2);                                                // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, iC, oC] always
+    auto weightsShapeInfo = inputShape->at(1);                                                // [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
    auto gradIShapeShapeInfo = inputShape->at(0);                                             // [4]

    const int rank = 4;
@ -103,8 +103,9 @@ DECLARE_SHAPE_FN(deconv2d_tf) {
    const int dW = INT_ARG(7);                                                        // dilations width
    const int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    const int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    const int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, oC], 1 - [oC, iC, kH, kW], 2 - [oC, kH, kW, iC]

-    int indIOioC, indIiH, indWoC(3), indOoH;
+    int indIOioC, indIiH, indWoC(0 == wFormat ? 3 : 0), indOoH;
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1; indOoH = 1;
    }
@ -126,7 +127,7 @@ DECLARE_SHAPE_FN(deconv2d_tf) {
    ConvolutionUtils::calcOutSizeDeconv2D(trueiH, trueiW, kH, kW, sH, sW, pH, pW, dH, dW, oH, oW, isSameMode);

    std::vector<Nd4jLong> expectedGradIShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,trueiH,trueiW,  0,indIOioC,indIiH,indIiH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, oC);
    REQUIRE_TRUE(expectedGradIShape == gradIShape, 0,  "CUSTOM DECONV2D_TF OP: wrong shape of array with output shape, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradIShape).c_str(), ShapeUtils::shapeAsString(gradIShape).c_str());
    REQUIRE_TRUE(shape::shapeEquals(4, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DECONV2D_TF OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());

--- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp
@ -32,7 +32,7 @@ namespace ops  {
 CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, oC, iC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kD, kH, kW, oC, iC], [iC, oC, kD, kH, kW], [iC, kD, kH, kW, oC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]

    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW)
@ -53,13 +53,14 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) {
    int dH = INT_ARG(10);                                                           // dilations height
    int dW = INT_ARG(11);                                                           // dilations width
    int isSameMode = INT_ARG(12);                                                   // 0-SAME,  1-VALID
-    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;           // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;            // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;             // 0 - [kD, kH, kW, oC, iC], 1 - [iC, oC, kD, kH, kW], 2 - [iC, kD, kH, kW, oC]

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, wFormat, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD);

-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, oC, iC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DECONV3D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DECONV3D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
@ -67,16 +68,23 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) {
    if(!isNCDHW)
        output = new NDArray(output->permute({0, 4, 1, 2, 3}));                 // [bS, oD, oH, oW, oC] -> [bS, oC, oD, oH, oW]

+    std::vector<int> colPermut;
+    if(1 == wFormat)
+        colPermut = {1,2,3,4,0,5,6,7};
+    else
+        colPermut = {2,3,4,1,0,5,6,7};
+
    if(isSameMode)         // Note: we're intentionally swapping iH and oH, to calculated the padding for a"normal" conv (not deconv) forward pass
        ConvolutionUtils::calcPadding3D(pD, pH, pW, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, dD, dH, dW);

    NDArray columns(input->ordering(), {bS, oC, kD, kH, kW, iD, iH, iW}, input->dataType(),  block.launchContext());

    //----- calculation of output -----//
-    // NDHWC: [kD, kH, kW, oC, iC] x [bS, iD, iH, iW, iC] = [kD, kH, kW, oC, bS, iD, iH, iW]
-    // NCDHW: [kD, kH, kW, oC, iC] x [bS, iC, iD, iH, iW] = [kD, kH, kW, oC, bS, iD, iH, iW]
-    sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 4, 1, 0, 5, 6, 7});   // [bS, oC, kD, kH, kW, iD, iH, iW] -> [kD, kH, kW, oC, bS, iD, iH, iW]
-    ConvolutionUtils::col2vol(block, columns, *output, sD, sH, sW, pD, pH, pW, dD, dH, dW);                   // [bS, oC, kD, kH, kW, iD, iH, iW] is de-convoluted to [bS, oC, oD, oH, oW]
+    // [kD, kH, kW, oC, iC] x [bS, iD, iH, iW, iC] = [kD, kH, kW, oC, bS, iD, iH, iW]
+    // [iC, oC, kD, kH, kW] x [bS, iD, iH, iW, iC] = [oC, kD, kH, kW, bS, iD, iH, iW]
+    // [iC, kD, kH, kW, oC] x [bS, iD, iH, iW, iC] = [kD, kH, kW, oC, bS, iD, iH, iW]
+    sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, colPermut);       // [bS, oC, kD, kH, kW, iD, iH, iW] -> [kD, kH, kW, oC, bS, iD, iH, iW]
+    ConvolutionUtils::col2vol(block, columns, *output, sD, sH, sW, pD, pH, pW, dD, dH, dW);     // [bS, oC, kD, kH, kW, iD, iH, iW] is de-convoluted to [bS, oC, oD, oH, oW]

    //----- add biases if required -----//
    if(bias)
@ -101,7 +109,7 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) {
 DECLARE_SHAPE_FN(deconv3d) {

    auto inputShapeInfo   = inputShape->at(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NDCHW)
-    auto weightsShapeInfo = inputShape->at(1);                                    // [kD, kH, kW, oC, iC] always
+    auto weightsShapeInfo = inputShape->at(1);                                    // [kD, kH, kW, oC, iC], [iC, oC, kD, kH, kW], [iC, kD, kH, kW, oC]
    auto biasShapeInfo    = block.width() > 2 ? inputShape->at(2) : nullptr;      // [oC]

    const int rank = 5;
@ -122,8 +130,9 @@ DECLARE_SHAPE_FN(deconv3d) {
    int dW = INT_ARG(11);                                                       // dilations width
    int isSameMode = INT_ARG(12);                                               // 0-SAME,  1-VALID
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0 - [kD, kH, kW, oC, iC], 1 - [iC, oC, kD, kH, kW], 2 - [iC, kD, kH, kW, oC]

-    int indIOioC, indIiD, indWoC(3);
+    int indIOioC, indIiD, indWoC(0 == wFormat ? 3 : (1 == wFormat ? 1 : 4));
    if(!isNCDHW) {
        indIOioC = 4; indIiD = 1;
    }
@ -138,7 +147,7 @@ DECLARE_SHAPE_FN(deconv3d) {
    const int iC = inputShapeInfo[indIOioC+1];                  // input channels
    const int oC = weightsShapeInfo[indWoC+1];                  // output channels

-    std::vector<Nd4jLong>  expectedWeightsShape = {kD, kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, oC, iC);
    REQUIRE_TRUE(shape::shapeEquals(5, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DECONV3D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(shape::rank(biasShapeInfo) <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM DECONV3D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, shape::rank(biasShapeInfo), shape::length(biasShapeInfo));
@ -174,12 +183,12 @@ DECLARE_SHAPE_FN(deconv3d) {
 CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, oC, iC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kD, kH, kW, oC, iC], [iC, oC, kD, kH, kW], [iC, kD, kH, kW, oC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next

    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), gradI
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kD, kH, kW, oC, iC] always
+    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kD, kH, kW, oC, iC], [iC, oC, kD, kH, kW], [iC, kD, kH, kW, oC]
    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]

    REQUIRE_TRUE(input->rankOf()   == 5, 0, "CUSTOM DECONV3D_BP OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
@ -201,16 +210,17 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {
    int dW = INT_ARG(11);                                                       // dilations width
    int isSameMode = INT_ARG(12);                                               // 0-SAME,  1-VALID
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0 - [kD, kH, kW, oC, iC], 1 - [iC, oC, kD, kH, kW], 2 - [iC, kD, kH, kW, oC]

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, wFormat, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD);

    int trueoD, trueoH, trueoW;          // true output height, width
    ConvolutionUtils::calcOutSizeDeconv3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, oC, iC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, oC, iC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM DECONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DECONV3D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
@ -221,7 +231,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {

     // ----- calculation of gradI -> pass it through conv3d_ff ----- //
    sd::ops::conv3dnew conv3d;
-    const Nd4jStatus status = conv3d.execute({gradO, weights}, {gradI}, {}, {kD,kH,kW,  sD,sH,sW,  pD,pH,pW,  dD,dH,dW,  isSameMode,  !isNCDHW}, {});
+    const Nd4jStatus status = conv3d.execute({gradO, weights}, {gradI}, {}, {kD,kH,kW,  sD,sH,sW,  pD,pH,pW,  dD,dH,dW,  isSameMode,  !isNCDHW, wFormat}, {});
    if (status != ND4J_STATUS_OK)
        return status;

@ -235,10 +245,16 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {
    else
        inputAxesForDot = {0, 2, 3, 4};                                         // bS, iD, iH, iW

+    std::vector<int> gradWAxes;     // empty for wFormat = 1
+    if(0 == wFormat)
+        gradWAxes = {4,3,0,1,2};
+    else if(2 == wFormat)
+        gradWAxes = {0,4,1,2,3};
+
    // ----- calculation of gradW ----- //
    auto columns = NDArrayFactory::create(input->ordering(), {bS, oC, kD, kH, kW, iD, iH, iW},  input->dataType(), block.launchContext());
-    ConvolutionUtils::vol2col(block, *gradO, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW);                  // [bS, oC, oD, oH, oW] is deconvoluted to [bS, oC, kD, kH, kW, iD, iH, iW]
-    MmulHelper::tensorDot(input, &columns, gradW, inputAxesForDot, {0, 5, 6, 7}, {4, 3, 0, 1, 2});   // [bS, iC, iD, iH, iW]/[bS, iD, iH, iW, iC] x [bS, oC, kD, kH, kW, iD, iH, iW] = [iC, oC, kD, kH, kW]
+    ConvolutionUtils::vol2col(block, *gradO, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW);     // [bS, oC, oD, oH, oW] is deconvoluted to [bS, oC, kD, kH, kW, iD, iH, iW]
+    MmulHelper::tensorDot(input, &columns, gradW, inputAxesForDot, {0, 5, 6, 7}, gradWAxes);   // [bS, iC, iD, iH, iW]/[bS, iD, iH, iW, iC] x [bS, oC, kD, kH, kW, iD, iH, iW] = [iC, oC, kD, kH, kW]

    // ----- calculation of gradB ----- //
    if(gradB) {
@ -267,7 +283,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) {
 DECLARE_SHAPE_FN(deconv3d_bp) {

    auto inputShapeInfo   = inputShape->at(0);                                                // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto weightsShapeInfo = inputShape->at(1);                                                // [kD, kH, kW, oC, iC] always
+    auto weightsShapeInfo = inputShape->at(1);                                                // [kD, kH, kW, oC, iC], [iC, oC, kD, kH, kW], [iC, kD, kH, kW, oC]
    Nd4jLong* biasShapeInfo    = block.width() > 3 ? inputShape->at(2) : nullptr;             // [oC]
    Nd4jLong* gradOShapeInfo   = block.width() > 3 ? inputShape->at(3) : inputShape->at(2);   // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next

@ -290,8 +306,9 @@ DECLARE_SHAPE_FN(deconv3d_bp) {
    int dW = INT_ARG(11);                                                       // dilations width
    int isSameMode = INT_ARG(12);                                               // 0-SAME,  1-VALID
    int isNCDHW  = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1;       // INT_ARG(13): 1-NDHWC, 0-NCDHW
+    int wFormat = block.getIArguments()->size() > 14 ? INT_ARG(14) : 0;         // 0 - [kD, kH, kW, oC, iC], 1 - [iC, oC, kD, kH, kW], 2 - [iC, kD, kH, kW, oC]

-    int indIOioC, indIiD, indWoC(3);
+    int indIOioC, indIiD, indWoC(0 == wFormat ? 3 : (1 == wFormat ? 1 : 4));
    if(!isNCDHW) {
        indIOioC = 4; indIiD = 1;
    }
@ -310,8 +327,8 @@ DECLARE_SHAPE_FN(deconv3d_bp) {
    ConvolutionUtils::calcOutSizeDeconv3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW,  0,indIOioC,indIiD,indIiD+1,indIiD+2});
-    std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, oC, iC};
-    REQUIRE_TRUE(shape::shapeEquals(5, expectedGradOShape.data(), shape::rank(gradOShapeInfo), shape::shapeOf(gradOShapeInfo)), 0,  "CUSTOM DECONV3D_BP OP: wrong shape of output gradients next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kD, kH, kW, oC, iC);
+    REQUIRE_TRUE(shape::shapeEquals(5, expectedGradOShape.data(), shape::rank(gradOShapeInfo), shape::shapeOf(gradOShapeInfo)), 0,  "CUSTOM DECONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(shape::shapeEquals(5, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DECONV3D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
        REQUIRE_TRUE(biasShapeInfo[0] <= 2 && oC == shape::length(biasShapeInfo), 0, "CUSTOM DECONV3D_BP OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
--- a/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp
@ -32,10 +32,10 @@ namespace ops  {
 CUSTOM_OP_IMPL(depthwise_conv2d, 2, 1, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, mC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC] = iC*mC

-    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
+    auto output  = OUTPUT_NULLIFIED(0);                                   // [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)

    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM DEPTHWISECONV2D OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM DEPTHWISECONV2D OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
@ -50,19 +50,20 @@ CUSTOM_OP_IMPL(depthwise_conv2d, 2, 1, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW     = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weights->sizeAt(indWmC);                           // channels multiplier

-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DEPTHWISECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !",  ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    REQUIRE_TRUE(output->sizeAt(indIOioC) == iC*mC, 0, "CUSTOM DEPTHWISECONV2D OP: the output_channels must be equal to input_channels * channels_multiplier = %i !", iC*mC);
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DEPTHWISECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());

-    ConvolutionUtils::depthwiseConv2d(block, input, weights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW);
+    ConvolutionUtils::depthwiseConv2d(block, input, weights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW,isSameMode,isNCHW,wFormat);

    return Status::OK();
 }
@ -75,7 +76,7 @@ CUSTOM_OP_IMPL(depthwise_conv2d, 2, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(depthwise_conv2d) {

    Nd4jLong* inputShapeInfo   = inputShape->at(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    Nd4jLong* weightsShapeInfo = inputShape->at(1);                                    // [kH, kW, iC, mC] always
+    Nd4jLong* weightsShapeInfo = inputShape->at(1);                                    // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
    Nd4jLong* biasShapeInfo    = block.width() > 2 ? inputShape->at(2) : nullptr;      // [oC] = iC*mC

    const int rank = 4;
@ -92,8 +93,9 @@ DECLARE_SHAPE_FN(depthwise_conv2d) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

-    int indIOioC, indIiH, indWmC(3);
+    int indIOioC, indIiH, indWmC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -109,7 +111,7 @@ DECLARE_SHAPE_FN(depthwise_conv2d) {
    const int oC = iC*mC;                                       // output channels


-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(shape::shapeEquals(4, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "DEPTHWISECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(shape::rank(biasShapeInfo) <= 2 && oC == shape::length(biasShapeInfo), 0, "DEPTHWISECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, shape::rank(biasShapeInfo), shape::length(biasShapeInfo));
@ -148,13 +150,13 @@ DECLARE_SHAPE_FN(depthwise_conv2d) {
 CUSTOM_OP_IMPL(depthwise_conv2d_bp, 3, 2, false, 0, 9) {

    auto input   = INPUT_VARIABLE(0);                                                // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
-    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
+    auto weights = INPUT_VARIABLE(1);                                                // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
    auto bias    = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr;                  // [oC] = [iC*mC]
    auto gradO   = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2);        // [bS, oH, oW, oC] (NDHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next

-    auto gradI = OUTPUT_VARIABLE(0);                                                 // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW), epsilon
-    auto gradW = OUTPUT_VARIABLE(1);                                                 // [kH, kW, iC, mC] always
-    auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr;                   // [oC]
+    auto gradI = OUTPUT_NULLIFIED(0);                                                 // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW), epsilon
+    auto gradW = OUTPUT_NULLIFIED(1);                                                 // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    auto gradB = block.width() > 3 ? OUTPUT_NULLIFIED(2) : nullptr;                   // [oC]

    REQUIRE_TRUE(input->rankOf()   == 4, 0, "CUSTOM DEPTHWISECONV2D_BP OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
    REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM DEPTHWISECONV2D_BP OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
@ -170,23 +172,24 @@ CUSTOM_OP_IMPL(depthwise_conv2d_bp, 3, 2, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weights->sizeAt(indWmC);                           // channels multiplier

    int trueoH, trueoW;          // correct output height, width
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong> expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indOoH,indOoH+1});
-    std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0,  "CUSTOM DEPTHWISECONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM DEPTHWISECONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if(bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM DEPTHWISECONV2D_BP OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());

-    ConvolutionUtils::depthwiseConv2dBP(block, input, weights, bias, gradO, gradI, gradW, gradB, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW);
+    ConvolutionUtils::depthwiseConv2dBP(block, input, weights, bias, gradO, gradI, gradW, gradB, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW, wFormat);

    return Status::OK();
 }
@ -214,8 +217,9 @@ DECLARE_SHAPE_FN(depthwise_conv2d_bp) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

-    int indIOioC, indIiH, indWmC(3);
+    int indIOioC, indIiH, indWmC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -234,7 +238,7 @@ DECLARE_SHAPE_FN(depthwise_conv2d_bp) {
    ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode);

    std::vector<Nd4jLong>  expectedGradOShape   = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indIiH,indIiH+1});
-    std::vector<Nd4jLong>  expectedWeightsShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong>  expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(shape::shapeEquals(4, expectedGradOShape.data(), shape::rank(gradOShapeInfo), shape::shapeOf(gradOShapeInfo)), 0,  "CUSTOM DEPTHWISECONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
    REQUIRE_TRUE(shape::shapeEquals(4, expectedWeightsShape.data(), shape::rank(weightsShapeInfo), shape::shapeOf(weightsShapeInfo)), 0, "CUSTOM DEPTHWISECONV2D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if(biasShapeInfo)
--- a/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp
@ -30,8 +30,7 @@ namespace sd {
    namespace ops {
        CUSTOM_OP_IMPL(im2col, 1, 1, false, 0, 9) {
            auto x = INPUT_VARIABLE(0);
-            auto z = OUTPUT_VARIABLE(0);
-
+            auto z = OUTPUT_NULLIFIED(0);

            REQUIRE_TRUE(x->rankOf() == 4, 0, "im2col input should be 4D, but got %i instead", x->rankOf());
            REQUIRE_TRUE(z->rankOf() == 6, 0, "im2col output should be 6D, but got %i instead", z->rankOf());
@ -53,8 +52,6 @@ namespace sd {
            LaunchContext* ctx = block.launchContext();
            sd::ops::helpers::im2col(*ctx, *x, *z, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, NDArrayFactory::create(zeroPadVal, block.launchContext()));

-            STORE_RESULT(*z);
-
            return Status::OK();
        }

@ -107,7 +104,7 @@ namespace sd {
 		CUSTOM_OP_IMPL(im2col_bp, 2, 1, false, 0, 9) {
            auto input = INPUT_VARIABLE(0);
 			auto gradAtOutput = INPUT_VARIABLE(1);
-            auto z = OUTPUT_VARIABLE(0);
+            auto z = OUTPUT_NULLIFIED(0);

            REQUIRE_TRUE(input->rankOf() == 4, 0, "im2col_bp input should be 4D, but got %i instead", input->rankOf());
 			REQUIRE_TRUE(gradAtOutput->rankOf() == 6, 0, "im2col_bp gradient at output (input idx 1) should be 6D, but got %i instead", gradAtOutput->rankOf());
--- a/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp
@ -29,7 +29,7 @@ namespace ops  {
 CUSTOM_OP_IMPL(pointwise_conv2d, 2, 1, false, 0, 0) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    auto weights = INPUT_VARIABLE(1);                                    // [1,  1,  iC, oC] always
+    auto weights = INPUT_VARIABLE(1);                                    // [1, 1, iC, oC], [oC, iC, 1, 1], [oC, 1, 1, iC]
    auto bias    = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr;      // [oC]

    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, iH, iW, oC] (NHWC) or [bS, oC, iH, iW] (NCHW)
@ -47,18 +47,19 @@ CUSTOM_OP_IMPL(pointwise_conv2d, 2, 1, false, 0, 0) {
    int pW = 0;                                                             // paddings width
    int dH = 1;                                                             // dilations height
    int dW = 1;                                                             // dilations width
-    int isNCHW = block.getIArguments()->size() > 0 ? !INT_ARG(0) : 1;       // INT_ARG(0): 0-NCHW, 1-NHWC
+    int isNCHW  = block.getIArguments()->size() > 0 ? !INT_ARG(0) : 1;      // INT_ARG(0): 0-NCHW, 1-NHWC
+    int wFormat = block.getIArguments()->size() > 1 ? INT_ARG(1) : 0;       // 0 - [1, 1, iC, oC], 1 - [oC, iC, 1, 1], 2 - [oC, 1, 1, iC]

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

-    std::vector<Nd4jLong> expectedWeightsShape = {1, 1, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC, oC);
    REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM POINTWISECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
    if (bias)
        REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM POINTWISECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());

-    ConvolutionUtils::conv2d(block, input, weights, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, 1/*isSameMode*/, isNCHW);
+    ConvolutionUtils::conv2d(block, input, weights, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, 1/*isSameMode*/, isNCHW, wFormat);

    return Status::OK();
 }
@ -73,7 +74,7 @@ CUSTOM_OP_IMPL(pointwise_conv2d, 2, 1, false, 0, 0) {
 DECLARE_SHAPE_FN(pointwise_conv2d) {

    Nd4jLong* inputShapeInfo  = inputShape->at(0);                                   // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    Nd4jLong* weightsShapeInfo  = inputShape->at(1);                                 // [1,  1,  iC, oC] always
+    Nd4jLong* weightsShapeInfo  = inputShape->at(1);                                 // [1, 1, iC, oC], [oC, iC, 1, 1], [oC, 1, 1, iC]
    Nd4jLong* biasShapeInfo = block.width() > 2 ? inputShape->at(2) : nullptr;       // [oC]

    const int rank = 4;
@ -81,8 +82,9 @@ DECLARE_SHAPE_FN(pointwise_conv2d) {
    REQUIRE_TRUE(weightsShapeInfo[0] == rank, 0, "CUSTOM POINTWISECONV2D OP: rank of weights array must be equal to %i, but got %i instead !", rank, weightsShapeInfo[0]);

    int isNCHW = block.getIArguments()->size() > 0 ? !INT_ARG(0) : 1;       // INT_ARG(0): 0-NCHW, 1-NHWC
+    int wFormat = block.getIArguments()->size() > 1 ? INT_ARG(1) : 0;       // 0 - [1, 1, iC, oC], 1 - [oC, iC, 1, 1], 2 - [oC, 1, 1, iC]

-    int indIOioC, indWoC(3);
+    int indIOioC, indWoC(0 == wFormat ? 3 : 0);
    if(!isNCHW)
        indIOioC = 3;
    else
@ -92,7 +94,7 @@ DECLARE_SHAPE_FN(pointwise_conv2d) {
    const int iC = inputShapeInfo[indIOioC+1];                   // input channels
    const int oC = weightsShapeInfo[indWoC+1];                   // output channels

-    std::vector<Nd4jLong> expectedWeightsShape = {1, 1, iC, oC};
+    std::vector<Nd4jLong> expectedWeightsShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC, oC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsShapeInfo, expectedWeightsShape), 0, "POINTWISECONV2D OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weightsShapeInfo).c_str());
    if (biasShapeInfo)
        REQUIRE_TRUE(biasShapeInfo[0] <= 2 && oC == shape::length(biasShapeInfo), 0, "POINTWISECONV2D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, biasShapeInfo[0], shape::length(biasShapeInfo));
--- a/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp
@ -33,11 +33,11 @@ namespace ops  {
 CUSTOM_OP_IMPL(sconv2d, 2, 1, false, 0, 9) {

    NDArray *input        = INPUT_VARIABLE(0);                    // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
-    NDArray *weightsDepth = INPUT_VARIABLE(1);                    // [kH, kW, iC, mC]  always
-    NDArray *weightsPoint = nullptr;                              // [1, 1, iC*mC, oC] always
+    NDArray *weightsDepth = INPUT_VARIABLE(1);                    // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    NDArray *weightsPoint = nullptr;                              // [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
    NDArray *bias         = nullptr;                              // [oC], if weightsPoint=nullptr then oC = iC*mC

-    NDArray *output    = OUTPUT_VARIABLE(0);                      // [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)
+    NDArray *output    = OUTPUT_NULLIFIED(0);                      // [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)

    if(block.width() == 3) {
        if((INPUT_VARIABLE(2))->rankOf() == 4)
@ -66,17 +66,19 @@ CUSTOM_OP_IMPL(sconv2d, 2, 1, false, 0, 9) {
    int dH = INT_ARG(6);                                                        // dilations height
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
-    int isNCHW     = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int isNCHW  = block.getIArguments()->size() > 9  ? !INT_ARG(9) : 1;         // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]
+

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier

-    std::vector<Nd4jLong> expectedWeightsDShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsDShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(weightsDepth->isSameShape(expectedWeightsDShape), 0, " SCONV2D OP: wrong shape of weightsDepth array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsDShape).c_str(), ShapeUtils::shapeAsString(weightsDepth).c_str());
    if(weightsPoint) {
-        std::vector<Nd4jLong>  expectedWeightsPShape = {1, 1, iC*mC, oC};
+        std::vector<Nd4jLong>  expectedWeightsPShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC*mC, oC);
        REQUIRE_TRUE(weightsPoint->isSameShape(expectedWeightsPShape), 0, " SCONV2D OP: wrong shape of weightsPoint array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsPShape).c_str(), ShapeUtils::shapeAsString(weightsPoint).c_str());
    }
    if (bias)
@ -84,11 +86,11 @@ CUSTOM_OP_IMPL(sconv2d, 2, 1, false, 0, 9) {

    if (iC == 1) {
        nd4j_debug("SCONV2D OP: for input_channels = 1 this op is equivalent to standard conv2d\n","");
-        ConvolutionUtils::conv2d(block, input, weightsDepth, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW);
+        ConvolutionUtils::conv2d(block, input, weightsDepth, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW, wFormat);
        return Status::OK();
    }

-    ConvolutionUtils::sconv2d(block, input, weightsDepth, weightsPoint, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW);
+    ConvolutionUtils::sconv2d(block, input, weightsDepth, weightsPoint, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW, wFormat);

    return Status::OK();
 }
@ -103,8 +105,8 @@ CUSTOM_OP_IMPL(sconv2d, 2, 1, false, 0, 9) {
 DECLARE_SHAPE_FN(sconv2d) {

    auto inputShapeInfo    = inputShape->at(0);         // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
-    auto weightsDShapeInfo = inputShape->at(1);         // [kH, kW, iC, mC]  always
-    Nd4jLong* weightsPShapeInfo = nullptr;              // [1, 1, iC*mC, oC] always
+    auto weightsDShapeInfo = inputShape->at(1);         // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    Nd4jLong* weightsPShapeInfo = nullptr;              // [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
    Nd4jLong* biasShapeInfo     = nullptr;              // [oC], oC = iC*mC if weightsPoint=nullptr

    if(block.width() == 3)
@ -135,8 +137,9 @@ DECLARE_SHAPE_FN(sconv2d) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW  = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;          // INT_ARG(9): 1-NHWC, 0-NCHW
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

-    int indIOioC, indIiH, indWmC(3);
+    int indIOioC, indIiH, indWmC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -148,13 +151,13 @@ DECLARE_SHAPE_FN(sconv2d) {
    const int iH = inputShapeInfo[indIiH+1];                                        // input height
    const int iW = inputShapeInfo[indIiH+2];                                        // input width
    const int iC = inputShapeInfo[indIOioC+1];                                      // input channels
-    const int mC = weightsDShapeInfo[indWmC+1];                                      // channel multiplier
-    const int oC = weightsPShapeInfo ? weightsPShapeInfo[indWmC+1] : iC*mC;       // output channels (oC or iC*mC)
+    const int mC = weightsDShapeInfo[indWmC+1];                                     // channel multiplier
+    const int oC = weightsPShapeInfo ? weightsPShapeInfo[indWmC+1] : iC*mC;         // output channels (oC or iC*mC)

-    std::vector<Nd4jLong>  expectedWeightsDShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsDShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsDShapeInfo, expectedWeightsDShape), 0, "SCONV2D OP: wrong shape of depth weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsDShape).c_str(), ShapeUtils::shapeAsString(weightsDShapeInfo).c_str());
    if(weightsPShapeInfo) {
-        std::vector<Nd4jLong> expectedWeightsPShape = {1, 1, iC*mC, oC};
+        std::vector<Nd4jLong> expectedWeightsPShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC*mC, oC);
        REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsPShapeInfo, expectedWeightsPShape), 0, "SCONV2D OP: wrong shape of point array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsPShape).c_str(), ShapeUtils::shapeAsString(weightsPShapeInfo).c_str());
    }
    if (biasShapeInfo)
@ -195,30 +198,30 @@ CUSTOM_OP_IMPL(sconv2d_bp, 3, 2, false, 0, 9) {

    NDArray *input        = INPUT_VARIABLE(0);                                           // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
    NDArray *gradO        = INPUT_VARIABLE(1);                                           // [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    NDArray *weightsDepth = INPUT_VARIABLE(2);                                           // [kH, kW, iC, mC] always
-    NDArray *weightsPoint = nullptr;                                                     // [1, 1, iC*mC, oC] always
+    NDArray *weightsDepth = INPUT_VARIABLE(2);                                           // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    NDArray *weightsPoint = nullptr;                                                     // [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
    NDArray *bias         = nullptr;                                                     // [oC], oC = iC*mC if weightsPoint=nullptr

-    NDArray *gradI  = OUTPUT_VARIABLE(0);                                                // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
-    NDArray *gradWD = OUTPUT_VARIABLE(1);                                                // [kH, kW, iC, mC] always
-    NDArray *gradWP = nullptr;                                                           // [1, 1, iC*mC, oC] always
+    NDArray *gradI  = OUTPUT_NULLIFIED(0);                                                // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    NDArray *gradWD = OUTPUT_NULLIFIED(1);                                                // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    NDArray *gradWP = nullptr;                                                           // [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
    NDArray *gradB  = nullptr;                                                           // [oC]

    if(block.width() == 4) {
        if((INPUT_VARIABLE(3))->rankOf() == 4) {
            weightsPoint = INPUT_VARIABLE(3);
-            gradWP       = OUTPUT_VARIABLE(2);
+            gradWP       = OUTPUT_NULLIFIED(2);
        }
        else {
            bias  = INPUT_VARIABLE(3);
-            gradB = OUTPUT_VARIABLE(2);
+            gradB = OUTPUT_NULLIFIED(2);
        }
    }
    else if(block.width() == 5) {
        weightsPoint = INPUT_VARIABLE(3);
        bias         = INPUT_VARIABLE(4);
-        gradWP       = OUTPUT_VARIABLE(2);
-        gradB        = OUTPUT_VARIABLE(3);
+        gradWP       = OUTPUT_NULLIFIED(2);
+        gradB        = OUTPUT_NULLIFIED(3);
    }


@ -244,17 +247,18 @@ CUSTOM_OP_IMPL(sconv2d_bp, 3, 2, false, 0, 9) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW     = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier

-    std::vector<Nd4jLong> expectedWeightsDShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsDShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(weightsDepth->isSameShape(expectedWeightsDShape), 0, " SCONV2D_BP OP: wrong shape of weightsDepth array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsDShape).c_str(), ShapeUtils::shapeAsString(weightsDepth).c_str());
    REQUIRE_TRUE(gradWD->isSameShape(expectedWeightsDShape),       0, " SCONV2D_BP OP: wrong shape of gradWD array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsDShape).c_str(), ShapeUtils::shapeAsString(gradWD).c_str());
    if(weightsPoint) {
-        std::vector<Nd4jLong> expectedWeightsPShape = {1, 1, iC*mC, oC};
+        std::vector<Nd4jLong> expectedWeightsPShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC*mC, oC);
        REQUIRE_TRUE(weightsPoint->isSameShape(expectedWeightsPShape), 0, " SCONV2D_BP OP: wrong shape of weightsPoint array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsPShape).c_str(), ShapeUtils::shapeAsString(weightsPoint).c_str());
        REQUIRE_TRUE(gradWP->isSameShape(expectedWeightsPShape),       0, " SCONV2D_BP OP: wrong shape of gradWP array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsPShape).c_str(), ShapeUtils::shapeAsString(gradWP).c_str());
    }
@ -274,12 +278,12 @@ CUSTOM_OP_IMPL(sconv2d_bp, 3, 2, false, 0, 9) {

        auto resultFFShape = isNCHW ? std::vector<Nd4jLong>({bS, mC*iC, oH, oW}) : std::vector<Nd4jLong>({bS, oH, oW, mC*iC});
        auto resultFF  = NDArrayFactory::create_(input->ordering(), resultFFShape, input->dataType(), block.launchContext());
-        ConvolutionUtils::sconv2d(block, input, weightsDepth, nullptr, nullptr, resultFF, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW);
+        ConvolutionUtils::sconv2d(block, input, weightsDepth, nullptr, nullptr, resultFF, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW, wFormat);

        auto gradIDepthShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC*mC,oH,oW,  0,indIOioC,indIiH,indIiH+1});
        auto gradIDepth  = NDArrayFactory::create_(resultFF->ordering(), gradIDepthShape, resultFF->dataType(), block.launchContext());                 // [bS, oH, oW, iC*mC]  (NHWC) or [bS, iC*mC, oH, oW] (NCHW)

-        ConvolutionUtils::conv2dBP(block, resultFF, weightsPoint, bias, gradO, gradIDepth, gradWP, gradB, 1,1, 1,1, 0,0, 1,1, isSameMode, isNCHW);    // in this case oH=iH and oW=iW
+        ConvolutionUtils::conv2dBP(block, resultFF, weightsPoint, bias, gradO, gradIDepth, gradWP, gradB, 1,1, 1,1, 0,0, 1,1, isSameMode, isNCHW, wFormat);    // in this case oH=iH and oW=iW

        gradO = gradIDepth;
        bias = gradB = nullptr;                     // if pointwise backprop was done then don't calculate gradB at depthwise_conv2d_bp step
@ -288,7 +292,7 @@ CUSTOM_OP_IMPL(sconv2d_bp, 3, 2, false, 0, 9) {
    }

    // ----- apply depthwise_conv2d_bp ----- //
-    ConvolutionUtils::depthwiseConv2dBP(block, input, weightsDepth, bias, gradO, gradI, gradWD, gradB, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW);
+    ConvolutionUtils::depthwiseConv2dBP(block, input, weightsDepth, bias, gradO, gradI, gradWD, gradB, kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, isNCHW, wFormat);

    if(weightsPoint)
        delete gradO;
@ -301,8 +305,8 @@ DECLARE_SHAPE_FN(sconv2d_bp) {

    auto inputShapeInfo    = inputShape->at(0);                 // [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
    auto gradOShapeInfo    = inputShape->at(1);                 // [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    auto weightsDShapeInfo = inputShape->at(2);                 // [kH, kW, iC, mC]  always
-    Nd4jLong* weightsPShapeInfo = nullptr;                      // [1, 1, iC*mC, oC] always
+    auto weightsDShapeInfo = inputShape->at(2);                 // [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    Nd4jLong* weightsPShapeInfo = nullptr;                      // [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
    Nd4jLong* biasShapeInfo     = nullptr;                      // [oC], oC = iC*mC if weightsPoint=nullptr

    if(block.width() == 4) {
@ -335,8 +339,9 @@ DECLARE_SHAPE_FN(sconv2d_bp) {
    int dW = INT_ARG(7);                                                        // dilations width
    int isSameMode = INT_ARG(8);                                                // 0-VALID, 1-SAME
    int isNCHW     = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1;       // INT_ARG(9): 0-NCHW,  1-NHWC
+    int wFormat = block.getIArguments()->size() > 10 ? INT_ARG(10) : 0;         // 0 - [kH, kW, iC, mC], 1 - [mC, iC, kH, kW], 2 - [mC, kH, kW, iC]

-    int indIOioC, indIiH, indWmC(3);
+    int indIOioC, indIiH, indWmC(0 == wFormat ? 3 : 0);
    if(!isNCHW) {
        indIOioC = 3; indIiH = 1;
    }
@ -356,10 +361,10 @@ DECLARE_SHAPE_FN(sconv2d_bp) {

    std::vector<Nd4jLong> expectedGradOShapeInfo = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW,  0,indIOioC,indIiH,indIiH+1});
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(gradOShapeInfo, expectedGradOShapeInfo), 0, "SCONV2D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShapeInfo).c_str(), ShapeUtils::shapeAsString(gradOShapeInfo).c_str());
-    std::vector<Nd4jLong> expectedWeightsDShape = {kH, kW, iC, mC};
+    std::vector<Nd4jLong> expectedWeightsDShape = ConvolutionUtils::expectWeightsShape(wFormat, kH, kW, iC, mC);
    REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsDShapeInfo, expectedWeightsDShape), 0, "SCONV2D_BP OP: wrong shape of depth weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsDShape).c_str(), ShapeUtils::shapeAsString(weightsDShapeInfo).c_str());
    if(weightsPShapeInfo) {
-        std::vector<Nd4jLong> expectedWeightsPShape = {1, 1, iC*mC, oC};
+        std::vector<Nd4jLong> expectedWeightsPShape = ConvolutionUtils::expectWeightsShape(wFormat, 1, 1, iC*mC, oC);
        REQUIRE_TRUE(ShapeUtils::areShapesEqual(weightsPShapeInfo, expectedWeightsPShape), 0, "SCONV2D_BP OP: wrong shape of point array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsPShape).c_str(), ShapeUtils::shapeAsString(weightsPShapeInfo).c_str());
    }
    if (biasShapeInfo)
--- a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp
@ -32,7 +32,7 @@ namespace ops  {
 //////////////////////////////////////////////////////////////////////
 CUSTOM_OP_IMPL(upsampling2d, 1, 1, false, 0, 2) {
    auto input  = INPUT_VARIABLE(0);             // [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
-    auto output = OUTPUT_VARIABLE(0);            // [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
+    auto output = OUTPUT_NULLIFIED(0);            // [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)

    const int factorH = INT_ARG(0);
    const int factorW = INT_ARG(1);
@ -97,7 +97,7 @@ CUSTOM_OP_IMPL(upsampling2d_bp, 2, 1, false, 0, 0) {

    // NDArray<T>* input = INPUT_VARIABLE(0);             // [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
    auto gradO = INPUT_VARIABLE(1);             // [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
-    auto gradI = OUTPUT_VARIABLE(0);            // [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
+    auto gradI = OUTPUT_NULLIFIED(0);            // [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)

    const int isNCHW  = block.getIArguments()->size() > 0 ? INT_ARG(0) : 0;       // INT_ARG(0): 0-NCHW,  1-NHWC

--- a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp
@ -31,7 +31,7 @@ namespace ops  {
 //////////////////////////////////////////////////////////////////////
 CUSTOM_OP_IMPL(upsampling3d, 1, 1, false, 0, 3) {
    auto input  = INPUT_VARIABLE(0);             // [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
-    auto output = OUTPUT_VARIABLE(0);            // [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
+    auto output = OUTPUT_NULLIFIED(0);            // [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
            
    const int factorD = INT_ARG(0);
    const int factorH = INT_ARG(1);
@ -97,7 +97,7 @@ DECLARE_SHAPE_FN(upsampling3d) {
 CUSTOM_OP_IMPL(upsampling3d_bp, 2, 1, false, 0, 0) {
    // NDArray<T>* input = INPUT_VARIABLE(0);             // [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
    auto gradO = INPUT_VARIABLE(1);             // [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
-    auto gradI = OUTPUT_VARIABLE(0);            // [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
+    auto gradI = OUTPUT_NULLIFIED(0);            // [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
                
    const int isNCDHW  = block.getIArguments()->size() > 0 ? INT_ARG(0) : 0;       // INT_ARG(0): 0-NCHW,  1-NHWC

--- a/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp
--- a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp
--- a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp
@ -31,7 +31,7 @@ namespace ops  {
 CUSTOM_OP_IMPL(avgpool2d, 1, 1, false, 0, 10) {

    auto input = INPUT_VARIABLE(0);
-    auto output = OUTPUT_VARIABLE(0);
+    auto output = OUTPUT_NULLIFIED(0);

    // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode;

@ -147,7 +147,7 @@ CUSTOM_OP_IMPL(avgpool2d_bp, 2, 1, false, 0, 10) {

    auto input = INPUT_VARIABLE(0);                          // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto gradO = INPUT_VARIABLE(1);                          // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    auto gradI = OUTPUT_VARIABLE(0);                         // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradI = OUTPUT_NULLIFIED(0);                         // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon

    int kH = INT_ARG(0);                                                        // filter(kernel) height
    int kW = INT_ARG(1);                                                        // filter(kernel) width
@ -166,7 +166,7 @@ CUSTOM_OP_IMPL(avgpool2d_bp, 2, 1, false, 0, 10) {

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, 0, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,oH,oW,  0,indIOioC,indIiH,indIiH+1});
    std::vector<Nd4jLong> expectedGradIShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,iH,iW,  0,indIOioC,indIiH,indIiH+1});
--- a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp
@ -32,7 +32,7 @@ namespace ops  {
 CUSTOM_OP_IMPL(avgpool3dnew, 1, 1, false, 0, 14) {

    auto input   = INPUT_VARIABLE(0);                                    // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
-    auto output  = OUTPUT_VARIABLE(0);                                   // [bS, oD, oH, oW, iC] (NDHWC) or [bS, iC, oD, oH, oW] (NCDHW)
+    auto output  = OUTPUT_NULLIFIED(0);                                   // [bS, oD, oH, oW, iC] (NDHWC) or [bS, iC, oD, oH, oW] (NCDHW)

    int kD = INT_ARG(0);                                                        // filter(kernel) depth
    int kH = INT_ARG(1);                                                        // filter(kernel) height
@ -55,7 +55,7 @@ CUSTOM_OP_IMPL(avgpool3dnew, 1, 1, false, 0, 14) {

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;             // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, 0, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);

    std::vector<Nd4jLong> expectedOutputShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,oD,oH,oW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
    REQUIRE_TRUE(output->isSameShape(expectedOutputShape), 0, "AVGPOOL3DNEW OP: wrong shape of output array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedOutputShape).c_str(), ShapeUtils::shapeAsString(output).c_str());
@ -149,7 +149,7 @@ CUSTOM_OP_IMPL(avgpool3dnew_bp, 2, 1, false, 0, 14) {

    auto input = INPUT_VARIABLE(0);                          // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
    auto gradO = INPUT_VARIABLE(1);                          // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next
-    auto gradI = OUTPUT_VARIABLE(0);                         // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon
+    auto gradI = OUTPUT_NULLIFIED(0);                         // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon

    const int kD = INT_ARG(0);                                                  // filter(kernel) depth
    const int kH = INT_ARG(1);                                                  // filter(kernel) height
@ -172,7 +172,7 @@ CUSTOM_OP_IMPL(avgpool3dnew_bp, 2, 1, false, 0, 14) {

    int bS, iC, iD, iH, iW, oC, oD, oH, oW;                     // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
    int indIOioC, indIOioD, indWoC, indWiC, indWkD;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
+    ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, 0, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);

    std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,oD,oH,oW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
    std::vector<Nd4jLong> expectedGradIShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,iD,iH,iW,  0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
--- a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp
@ -38,7 +38,7 @@ CUSTOM_OP_IMPL(maxpool2d, 1, 1, false, 0, 9) {
    REQUIRE_TRUE(input->rankOf() == 4, 0, "MAXPOOL2D OP: input array should have rank of 4, but got %i instead", input->rankOf());

    // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode;
-    auto output = OUTPUT_VARIABLE(0);
+    auto output = OUTPUT_NULLIFIED(0);

    const int kH = INT_ARG(0);
    const int kW = INT_ARG(1);
@ -150,7 +150,7 @@ CUSTOM_OP_IMPL(maxpool2d_bp, 2, 1, false, 0, 10) {

    auto input = INPUT_VARIABLE(0);                          // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    auto gradO = INPUT_VARIABLE(1);                          // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    auto gradI = OUTPUT_VARIABLE(0);                         // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    auto gradI = OUTPUT_NULLIFIED(0);                         // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon

    int kH = INT_ARG(0);                                                        // filter(kernel) height
    int kW = INT_ARG(1);                                                        // filter(kernel) width
@ -168,7 +168,7 @@ CUSTOM_OP_IMPL(maxpool2d_bp, 2, 1, false, 0, 10) {

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, 0, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,oH,oW,  0,indIOioC,indIiH,indIiH+1});
    std::vector<Nd4jLong> expectedGradIShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,iC,iH,iW,  0,indIOioC,indIiH,indIiH+1});
--- a/Show More
+++ b/Show More