From 98e28148798d91387409d0fe19c23d0b83518217 Mon Sep 17 00:00:00 2001
From: raver119
Date: Wed, 11 Sep 2019 21:50:28 +0300
Subject: [PATCH] Platform helpers (#8216)

* platform helpers draft
Signed-off-by: raver119

* typo
Signed-off-by: raver119

* disable platform cmake
Signed-off-by: raver119

* another draft
Signed-off-by: raver119

* mkldnn convolution refactored
Signed-off-by: raver119

* minor tweaks
Signed-off-by: raver119

* one more safety check
Signed-off-by: raver119

* prototype works
Signed-off-by: raver119

* meh
Signed-off-by: raver119

* force static library mode for mkldnn
Signed-off-by: raver119

* - ismax fix
- experimental arg fix
- don't enforce openblas on Apple hardware
Signed-off-by: raver119

* bunch of small fixes
Signed-off-by: raver119@gmail.com

* declare concurrent
Signed-off-by: raver119@gmail.com

* - MKLDNN version upgrade to 1.0.2
- avgpool2d/maxpool2d APIs update
Signed-off-by: raver119

* - avgpool2d_bp/maxpool2d_bp APIs update
Signed-off-by: raver119

* - conv2d/batchnorm APIs update
Signed-off-by: raver119

* - lrn/conv2d_bp/conv3d/conv3d_bp APIs update
Signed-off-by: raver119

* all ops converted to MKLDNN 1.x
Signed-off-by: raver119

* bunch of tweaks
Signed-off-by: raver119

* namespace for platform helpers
Signed-off-by: raver119

* make sure platform helpers aren't optimized out
Signed-off-by: raver119

* build cpu_features on x86 systems
Signed-off-by: raver119

* build cpu_features on x86 systems
Signed-off-by: raver119

* more of cpu_features
Signed-off-by: raver119

* - mkldnn removed from java
- cpu_features checks in CpuNDArrayFactory
Signed-off-by: raver119

* F16C definition renamed
Signed-off-by: raver119

* some mkldnn rearrangements
Signed-off-by: raver119

* check supported instructions before doing anything
Signed-off-by: raver119

* typo
Signed-off-by: raver119

* missed impl
Signed-off-by: raver119

* BUILD_PIC option
Signed-off-by: raver119

* conv2d fix
Signed-off-by: raver119

* avgpool3d fix
Signed-off-by: raver119

* avgpool3d_bp fix
Signed-off-by: raver119

* avgpool2d_bp leak fix
Signed-off-by: raver119

* avgpool3d_bp leak fix
Signed-off-by: raver119

* maxpool bp leaks fixed
Signed-off-by: raver119

* printf removed
Signed-off-by: raver119

* batchnorm fix
Signed-off-by: raver119

* AVX warning/error polishing
Signed-off-by: AlexDBlack

* Fix
Signed-off-by: AlexDBlack

* More polish
Signed-off-by: AlexDBlack

* Polish
Signed-off-by: AlexDBlack

* remove previous MKL-DNN support layer
Signed-off-by: raver119

* avx2 tweak
Signed-off-by: raver119

* allow static for apple
Signed-off-by: raver119@gmail.com

* exclude mkldnn in one more place
Signed-off-by: raver119

* exclude mkldnn in one more place
Signed-off-by: raver119

* restore OPENBLAS_PATH use
Signed-off-by: raver119

* add runtime check for avx/avx2 support
Signed-off-by: raver119

* convolution_auto
Signed-off-by: raver119

* Add logic for helper argument

* minor test fix
Signed-off-by: raver119

* few tweaks
Signed-off-by: raver119

* few tweaks
Signed-off-by: raver119

* skip OpTracker props for non-x86 builds
Signed-off-by: raver119

* linux arm isn't x86 :)
Signed-off-by: raver119

* avx-512
Signed-off-by: raver119

* CUDA presets fix
Signed-off-by: raver119

* BUILD_PIC
Signed-off-by: raver119

* prefetchw for avx2
Signed-off-by: raver119

* BUILD_PIC again
Signed-off-by: raver119
---
 libnd4j/CMakeLists.txt | 108 ++-
 libnd4j/CMakeLists.txt.cpu_features.in | 16 +
 ...eLists.txt.in => CMakeLists.txt.mkldnn.in} | 6 +-
 libnd4j/blas/CMakeLists.txt | 33 +-
 libnd4j/blas/NativeOps.h | 7 +
 libnd4j/blas/cpu/NativeOps.cpp | 73 ++
libnd4j/blas/cuda/NativeOps.cu | 16 + libnd4j/buildnativeoperations.sh | 36 +- libnd4j/include/array/DataBuffer.h | 2 +- libnd4j/include/cblas.h | 5 +- libnd4j/include/execution/LaunchContext.h | 9 + .../include/execution/cpu/LaunchContext.cpp | 16 +- .../include/execution/cuda/LaunchContext.cu | 4 + libnd4j/include/graph/Context.h | 12 - libnd4j/include/graph/impl/Context.cpp | 3 - libnd4j/include/helpers/MKLDNNStream.h | 37 +- libnd4j/include/helpers/impl/OpTracker.cpp | 27 + libnd4j/include/helpers/shape.h | 18 +- libnd4j/include/ops/declarable/BooleanOp.h | 2 +- .../include/ops/declarable/BroadcastableOp.h | 2 +- .../ops/declarable/DeclarableCustomOp.h | 4 +- .../include/ops/declarable/DeclarableListOp.h | 2 +- .../ops/declarable/DeclarableReductionOp.h | 4 +- .../ops/declarable/LegacyBroadcastBoolOp.h | 6 +- .../ops/declarable/LegacyBroadcastOp.h | 6 +- .../ops/declarable/LegacyIndexReduceOp.h | 6 +- libnd4j/include/ops/declarable/LegacyOp.h | 4 +- .../LegacyPairwiseTransformBoolOp.h | 6 +- .../declarable/LegacyPairwiseTransformOp.h | 6 +- .../include/ops/declarable/LegacyRandomOp.h | 8 +- .../include/ops/declarable/LegacyReduce3Op.h | 6 +- .../ops/declarable/LegacyReduceBoolOp.h | 6 +- .../ops/declarable/LegacyReduceFloatOp.h | 6 +- .../ops/declarable/LegacyReduceLongOp.h | 6 +- .../ops/declarable/LegacyReduceSameOp.h | 6 +- .../ops/declarable/LegacyScalarBoolOp.h | 6 +- .../include/ops/declarable/LegacyScalarOp.h | 6 +- .../include/ops/declarable/LegacyStatsOp.h | 6 +- .../ops/declarable/LegacyTransformAnyOp.h | 6 +- .../ops/declarable/LegacyTransformBoolOp.h | 6 +- .../ops/declarable/LegacyTransformFloatOp.h | 6 +- .../ops/declarable/LegacyTransformSameOp.h | 6 +- .../ops/declarable/LegacyTransformStrictOp.h | 6 +- .../include/ops/declarable/OpRegistrator.h | 17 +- .../include/ops/declarable/PlatformHelper.h | 81 ++ .../ops/declarable/generic/nn/batchnorm.cpp | 126 --- .../generic/{ => nn}/convo/col2im.cpp | 0 .../generic/{ => nn}/convo/conv1d.cpp | 0 .../generic/{ => nn}/convo/conv2d.cpp | 0 .../generic/{ => nn}/convo/conv3d.cpp | 227 ----- .../generic/{ => nn}/convo/deconv2d.cpp | 0 .../generic/{ => nn}/convo/deconv2d_tf.cpp | 0 .../generic/{ => nn}/convo/deconv3d.cpp | 0 .../{ => nn}/convo/depthwiseConv2d.cpp | 0 .../generic/{ => nn}/convo/dilation2d.cpp | 0 .../generic/{ => nn}/convo/im2col.cpp | 0 .../generic/{ => nn}/convo/ismax.cpp | 0 .../{ => nn}/convo/pointwiseConv2d.cpp | 0 .../generic/{ => nn}/convo/sconv2d.cpp | 0 .../generic/{ => nn}/convo/upsampling2d.cpp | 0 .../generic/{ => nn}/convo/upsampling3d.cpp | 0 .../include/ops/declarable/generic/nn/lrn.cpp | 1 - .../{convo => nn}/pooling/avgpool2d.cpp | 0 .../{convo => nn}/pooling/avgpool3d.cpp | 0 .../{convo => nn}/pooling/maxpool2d.cpp | 0 .../{convo => nn}/pooling/maxpool3d.cpp | 0 .../pooling/maxpool_with_argmax.cpp | 0 .../{convo => nn}/pooling/pnormpool2d.cpp | 0 .../ops/declarable/helpers/convolutions.h | 41 - .../declarable/helpers/cpu/convolutions.cpp | 790 ------------------ .../ops/declarable/helpers/cpu/lrn.cpp | 98 --- .../ops/declarable/impl/DeclarableOp.cpp | 18 +- .../ops/declarable/impl/OpRegistrator.cpp | 30 + .../ops/declarable/impl/PlatformHelper.cpp | 86 ++ .../include/ops/declarable/platform/README.md | 1 + .../platform/mkldnn/avgpooling2d.cpp | 143 ++++ .../platform/mkldnn/avgpooling2d_bp.cpp | 153 ++++ .../platform/mkldnn/avgpooling3d.cpp | 145 ++++ .../platform/mkldnn/avgpooling3d_bp.cpp | 158 ++++ .../declarable/platform/mkldnn/batchnorm.cpp | 166 ++++ 
.../ops/declarable/platform/mkldnn/conv2d.cpp | 153 ++++ .../declarable/platform/mkldnn/conv2d_bp.cpp | 243 ++++++ .../ops/declarable/platform/mkldnn/conv3d.cpp | 167 ++++ .../platform/mkldnn/conv3dnew_bp.cpp | 263 ++++++ .../ops/declarable/platform/mkldnn/lrn.cpp | 97 +++ .../platform/mkldnn/maxpooling2d.cpp | 149 ++++ .../platform/mkldnn/maxpooling2d_bp.cpp | 178 ++++ .../platform/mkldnn/maxpooling3d.cpp | 155 ++++ .../platform/mkldnn/maxpooling_3d_bp.cpp | 185 ++++ .../platform/mkldnn/mkldnnUtils.cpp | 404 +++++++++ .../declarable/platform/mkldnn/mkldnnUtils.h | 124 +++ libnd4j/include/platform_boilerplate.h | 45 + libnd4j/include/types/float16.h | 6 +- libnd4j/pom.xml | 28 + libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 2 +- .../layers_tests/ConvolutionTests1.cpp | 4 + .../tests_cpu/layers_tests/MklDnnTests.cpp | 70 ++ .../tests_cpu/libnd4j_tests/CMakeLists.txt | 58 +- .../java/org/nd4j/nativeblas/NativeOps.java | 6 + .../org/nd4j/nativeblas/Nd4jCudaPresets.java | 1 + .../nd4j-backend-impls/nd4j-native/pom.xml | 26 - .../cpu/nativecpu/CpuNDArrayFactory.java | 46 + .../java/org/nd4j/nativeblas/Nd4jCpu.java | 164 +++- .../org/nd4j/nativeblas/Nd4jCpuPresets.java | 9 +- .../org/nd4j/config/ND4JEnvironmentVars.java | 6 + 105 files changed, 3938 insertions(+), 1494 deletions(-) create mode 100644 libnd4j/CMakeLists.txt.cpu_features.in rename libnd4j/{tests_cpu/libnd4j_tests/CMakeLists.txt.in => CMakeLists.txt.mkldnn.in} (64%) create mode 100644 libnd4j/include/ops/declarable/PlatformHelper.h rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/col2im.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/conv1d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/conv2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/conv3d.cpp (61%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/deconv2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/deconv2d_tf.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/deconv3d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/depthwiseConv2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/dilation2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/im2col.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/ismax.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/pointwiseConv2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/sconv2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/upsampling2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{ => nn}/convo/upsampling3d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/avgpool2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/avgpool3d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/maxpool2d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/maxpool3d.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/maxpool_with_argmax.cpp (100%) rename libnd4j/include/ops/declarable/generic/{convo => nn}/pooling/pnormpool2d.cpp (100%) create mode 100644 libnd4j/include/ops/declarable/impl/PlatformHelper.cpp create mode 100644 libnd4j/include/ops/declarable/platform/README.md create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp create mode 100644 
libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/conv2d_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/conv3dnew_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp create mode 100644 libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h create mode 100644 libnd4j/include/platform_boilerplate.h create mode 100644 libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index 7d150b55b..e008608ad 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -8,12 +8,19 @@ set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) option(BUILD_TESTS "Build tests" OFF) +set(X86_BUILD false) + +if (NOT IOS_BUILD AND NOT ANDROID_BUILD AND NOT ${ARCH} MATCHES "power*" AND NOT ${ARCH} MATCHES "arm*") + set(X86_BUILD true) +endif() + # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D__APPLE_OS__=true -D_RELEASE=true") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D__APPLE_OS__=true") elseif(WIN32) + set(X86_BUILD true) if (NOT CUDA_BLAS) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " -g -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") @@ -32,6 +39,7 @@ endif() if(NATIVE) IF(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") + set(X86_BUILD false) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=native") ELSE() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") @@ -39,21 +47,101 @@ if(NATIVE) endif() if(NOT CUDA_BLAS) - if (NOT "${MKLDNN_PATH}" STREQUAL "") - set(HAVE_MKLDNN 1) - include_directories(${MKLDNN_PATH}/include/) - link_directories(${MKLDNN_PATH} ${MKLDNN_PATH}/lib/) - IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux") - set(MKLDNN_LIBRARIES mkldnn mklml_intel) - else() - set(MKLDNN_LIBRARIES mkldnn mklml) + # we need this definition to avoid global memory use within mkldnn + add_definitions(-DMKLDNN_ENABLE_CONCURRENT_EXEC=true) + + # there's a chance, we have no BLAS provided externally + if ("${OPENBLAS_PATH}" STREQUAL "") + #we don't want static OpenBLAS on Apple + set(BLA_STATIC ON) + if (NOT APPLE) + set(BLA_VENDOR "OpenBLAS") 
endif() - elseif (NOT "${OPENBLAS_PATH}" STREQUAL "") + + # look around for system blas instead + find_package(BLAS REQUIRED) + if (BLAS_FOUND) + message("Original library: ${BLAS_LIBRARIES}") + # workaround for for cmake being unable to find static blas library + SET(_TMP_B "") + if (APPLE) + string(REGEX REPLACE "\\.dylib$" ".lib" _TMP_B "${BLAS_LIBRARIES}") + elseif (WIN32) + string(REGEX REPLACE "\\.dll" ".lib" _TMP_B "${BLAS_LIBRARIES}") + else() + string(REGEX REPLACE "\\.so$" ".a" _TMP_B "${BLAS_LIBRARIES}") + endif() + set(BLAS_LIBRARIES "${_TMP_B}") + + message("Found external BLAS implementation: ${BLAS_LIBRARIES} ") + add_definitions(-D__EXTERNAL_BLAS__=true) + endif() + else() + # if we have externally provided OPENBLAS_PATH - let's use it set(HAVE_OPENBLAS 1) include_directories(${OPENBLAS_PATH}/include/) link_directories(${OPENBLAS_PATH} ${OPENBLAS_PATH}/lib/) set(OPENBLAS_LIBRARIES openblas) endif() + + # building cpu_features + if (X86_BUILD) + add_definitions(-DCPU_FEATURES=true) + set(BUILD_PIC "ON" CACHE STRING "Hack to enforce fPIC mode" FORCE) + configure_file(./CMakeLists.txt.cpu_features.in cpu_features-download/CMakeLists.txt) + message("CMAKE_COMMAND: ${CMAKE_COMMAND}") + execute_process(COMMAND ${CMAKE_COMMAND} -DBUILD_PIC=ON -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cpu_features-download ) + + if(result) + message(FATAL_ERROR "CMake step for cpu_features failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cpu_features-download ) + if(result) + message(FATAL_ERROR "Build step for cpu_features failed: ${result}") + endif() + + add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/cpu_features-src + ${CMAKE_CURRENT_BINARY_DIR}/cpu_features-build + EXCLUDE_FROM_ALL) + set(CPUF_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cpu_features-src) + include_directories(${CPUF_SOURCE_DIR}/include) + set(CPU_FEATURES cpu_features) + endif() + + # new mkl-dnn entry + if (${HELPERS_mkldnn}) + message("Going to pull & build mkldnn") + set(HAVE_MKLDNN 1) + set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE) + + configure_file(./CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) + if(result) + message(FATAL_ERROR "CMake step for mkldnn failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) + if(result) + message(FATAL_ERROR "Build step for mkldnn failed: ${result}") + endif() + + add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src + ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build + EXCLUDE_FROM_ALL) + + set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build) + set(mkldnn_EXT_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src) + set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}") + include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR}) + set(MKLDNN mkldnn) + endif() endif() # Download and unpack flatbuffers at configure time diff --git a/libnd4j/CMakeLists.txt.cpu_features.in b/libnd4j/CMakeLists.txt.cpu_features.in new file mode 100644 index 000000000..da1d6ebda --- /dev/null +++ b/libnd4j/CMakeLists.txt.cpu_features.in @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 2.8.2) + +project(mkldnn-download NONE) + +include(ExternalProject) +ExternalProject_Add(mkldnn + GIT_REPOSITORY https://github.com/google/cpu_features.git + GIT_TAG v0.4.1 + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/cpu_features-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/cpu_features-build" + CONFIGURE_COMMAND "" + CMAKE_ARGS "-DBUILD_PIC=ON" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt.in b/libnd4j/CMakeLists.txt.mkldnn.in similarity index 64% rename from libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt.in rename to libnd4j/CMakeLists.txt.mkldnn.in index c4b2b6edb..26d82034f 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt.in +++ b/libnd4j/CMakeLists.txt.mkldnn.in @@ -5,11 +5,11 @@ project(mkldnn-download NONE) include(ExternalProject) ExternalProject_Add(mkldnn GIT_REPOSITORY https://github.com/intel/mkl-dnn.git - GIT_TAG v0.18.1 + GIT_TAG v1.0.2 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-build" - CONFIGURE_COMMAND "${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src/scripts/prepare_mkl.sh" - CMAKE_ARGS -DMKLDNN_USE_MKL=ML -G \"Unix Makefiles\" -DMKLDNN_LIBRARY_TYPE=STATIC + CONFIGURE_COMMAND "" + CMAKE_ARGS -DMKLDNN_USE_MKL=ML -DMKLDNN_LIBRARY_TYPE=STATIC -G \"Unix Makefiles\" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index 257fa44bb..168b55ccf 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -78,14 +78,24 @@ IF(${ARCH} MATCHES "arm*") ELSEIF(${ARCH} MATCHES "power*") set(ARCH_TUNE "-mcpu=${ARCH} -mtune=${ARCH} -D__POWER") ELSEIF(${EXTENSION} MATCHES "avx2") - set(ARCH_TUNE "-march=${ARCH} -mtune=${ARCH} -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -D__F16C__=true") + message("Building AVX2 binary...") + set(ARCH_TUNE "-mmmx -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -mprefetchwt1 -DSD_F16C=true -DF_AVX2=true") ELSE() if ("${ARCH}" STREQUAL "x86-64") + message("Building x86_64 binary...") set(ARCH_TYPE "generic") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DF_X64=true") else() set(ARCH_TYPE "${ARCH}") endif() + IF(${EXTENSION} MATCHES "avx512") + message("Building AVX512 binary...") + # we need to set flag here, that we can use hardware f16 conversion + tell that cpu features should be tracked + message("Current CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmmx -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -mbmi -mbmi2 -mprefetchwt1 
-mclflushopt -mxsavec -mxsaves -DSD_F16C=true -DF_AVX512=true") + endif() + set(ARCH_TUNE "-march=${ARCH} -mtune=${ARCH_TYPE}") ENDIF() @@ -299,19 +309,31 @@ elseif(CPU_BLAS) file(GLOB_RECURSE MEMORY_SOURCES false ../include/memory/*.cpp ../include/memory/*.h) file(GLOB_RECURSE GRAPH_SOURCES false ../include/graph/*.cpp ../include/graph/*.h) file(GLOB_RECURSE CUSTOMOPS_SOURCES false ../include/ops/declarable/generic/*.cpp) - file(GLOB_RECURSE CUSTOMOPS_HELPERS_SOURCES false ../include/ops/declarable/helpers/cpu/*.cpp ../include/ops/declarable/helpers/impl/*.cpp) + file(GLOB_RECURSE CUSTOMOPS_GENERIC_SOURCES false ../include/ops/declarable/helpers/cpu/*.cpp ../include/ops/declarable/helpers/impl/*.cpp) file(GLOB_RECURSE OPS_SOURCES false ../include/ops/impl/*.cpp ../include/ops/declarable/impl/*.cpp ../include/ops/*.h) file(GLOB_RECURSE INDEXING_SOURCES false ../include/indexing/*.cpp ../include/indexing/*.h) file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/*.cpp ../include/helpers/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/*.cpp ../include/loops/*.h) + + #if MKLDNN is enabled - we're building mkldnn-powered helpers + if (HAVE_MKLDNN) + file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h) + endif() + + if (X86_BUILD) + #we disable platform optimizations for certains files + set_source_files_properties(cpu/NativeOps.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") + set_source_files_properties(../include/helpers/impl/OpTracker.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") + endif() + message("CPU BLAS") add_definitions(-D__CPUBLAS__=true) add_library(nd4jobj OBJECT cpu/NativeOps.cpp cpu/GraphExecutioner.cpp cpu/NativeOpExecutioner.cpp cpu/NDArray.cpp cpu/NDArrayFactory.cpp ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) if(IOS) add_library(${LIBND4J_NAME} STATIC $) @@ -320,12 +342,13 @@ elseif(CPU_BLAS) add_library(${LIBND4J_NAME} SHARED $) endif() - target_link_libraries(${LIBND4J_NAME} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES}) + # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. 
same applies to BLAS + target_link_libraries(${LIBND4J_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) if ("${LIBND4J_ALL_OPS}" AND "${LIBND4J_BUILD_MINIFIER}") message(STATUS "Building minifier...") add_executable(minifier ../minifier/minifier.cpp ../minifier/graphopt.cpp) - target_link_libraries(minifier ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES}) + target_link_libraries(minifier ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES}) endif() if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 4.9) diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index ef46e7752..06be5be04 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -1760,6 +1760,13 @@ ND4J_EXPORT Nd4jPointer lcCopyStream(OpaqueLaunchContext* lc); ND4J_EXPORT Nd4jPointer lcBlasHandle(OpaqueLaunchContext* lc); ND4J_EXPORT Nd4jPointer lcSolverHandle(OpaqueLaunchContext* lc); + +ND4J_EXPORT int binaryLevel(); +ND4J_EXPORT int optimalLevel(); + +ND4J_EXPORT bool isMinimalRequirementsMet(); +ND4J_EXPORT bool isOptimalRequirementsMet(); + } #endif //NATIVEOPERATIONS_NATIVEOPS_H diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index e016d58fe..2b5733523 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -76,6 +76,10 @@ bool experimentalSupport = false; #include #include +#ifdef CPU_FEATURES +#include +#endif + using namespace nd4j; void setElementThreshold(int num) { @@ -3167,6 +3171,75 @@ const char* lastErrorMessage() { return nd4j::LaunchContext::defaultContext()->errorReference()->errorMessage(); } +int binaryLevel() { +#ifdef CPU_FEATURES + + +#if defined(F_X64) + return 1; +#elif defined (F_AVX2) + return 2; +#elif defined (F_AVX512) + return 3; +#else + return 0; +#endif + +#else + return 0; +#endif +} + +int optimalLevel() { +#ifdef CPU_FEATURES + auto features = cpu_features::GetX86Info().features; + + if (features.avx && features.avx2 && features.avx512f && features.avx512vl && features.avx512bw && features.avx512dq && features.avx512cd) + return 3; + else if (features.avx && features.avx2) + return 2; + else + return 1; + +#else + return 0; +#endif +} + +bool isMinimalRequirementsMet() { +#ifdef CPU_FEATURES + auto features = cpu_features::GetX86Info().features; + +#if defined(F_X64) + return true; +#elif defined (F_AVX2) + return features.avx && features.avx2; +#elif defined (F_AVX512) + // we're optimizing for skylake-avx512 features, so we'll check those out + return features.avx && features.avx2 && features.avx512f && features.avx512vl && features.avx512bw && features.avx512dq && features.avx512cd; +#else + return true; +#endif + +#else + return true; +#endif +} + +bool isOptimalRequirementsMet() { +#ifdef CPU_FEATURES + auto b = ::binaryLevel(); + auto o = ::optimalLevel(); + + if (b == o) + return true; + else + return false; +#else + return true; +#endif +} + BUILD_SINGLE_TEMPLATE(template void pullRowsGeneric, (void *, Nd4jLong*, void*, Nd4jLong*, const int, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*, Nd4jLong*), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void tearGeneric, (void *, Nd4jLong*, Nd4jPointer*, Nd4jLong*, Nd4jLong*, Nd4jLong*), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void shuffleGeneric, (void**, Nd4jLong**, void**, Nd4jLong**, int, int*, Nd4jLong**, Nd4jLong**), LIBND4J_TYPES); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu 
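The four exports added to NativeOps above encode build/runtime capability as a small integer (0 = capability tracking unavailable, 1 = generic x86-64, 2 = AVX/AVX2, 3 = AVX-512), and they are what OpTracker (below) and, per the commit notes, the Java-side CpuNDArrayFactory consult. A minimal sketch of how a caller could combine them; the wrapper function, its messages and the direct extern declarations here are illustrative only, not part of the patch:

// Illustrative consumer of the capability exports declared in NativeOps.h.
#include <cstdio>
#include <cstdlib>

extern "C" {
    int binaryLevel();               // instruction set the binary was compiled for
    int optimalLevel();              // best instruction set the current CPU supports
    bool isMinimalRequirementsMet();
    bool isOptimalRequirementsMet();
}

static void checkBinaryCompatibility() {
    if (!isMinimalRequirementsMet()) {
        // binary requires instructions this CPU doesn't have; this is the same
        // condition OpTracker treats as fatal (it exits with code 119)
        std::printf("Binary level %d exceeds CPU capabilities\n", binaryLevel());
        std::exit(119);
    }

    if (!isOptimalRequirementsMet()) {
        // binary will run, but a faster build (avx2/avx512) exists for this CPU
        std::printf("Running level %d binary, CPU supports level %d\n", binaryLevel(), optimalLevel());
    }
}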
index ec88de2e5..ee1bed052 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3576,4 +3576,20 @@ int lastErrorCode() { const char* lastErrorMessage() { return nd4j::LaunchContext::defaultContext()->errorReference()->errorMessage(); +} + +int binaryLevel() { + return 0; +} + +int optimalLevel() { + return 0; +} + +bool isMinimalRequirementsMet() { + return true; +} + +bool isOptimalRequirementsMet() { + return true; } \ No newline at end of file diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index d6c855957..599c4f250 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -53,6 +53,7 @@ CLEAN="false" MINIFIER="false" TESTS="false" VERBOSE="false" +HELPER= NAME= while [[ $# > 0 ]] do @@ -60,6 +61,10 @@ key="$1" value="${2:-}" #Build type (release/debug), packaging type, chip: cpu,cuda, lib type (static/dynamic) case $key in + -h|--helper) + HELPER="$value" + shift # past argument + ;; -o|-platform|--platform) OS="$value" shift # past argument @@ -425,7 +430,7 @@ if [ "$PACKAGING" == "msi" ]; then PACKAGING_ARG="-DPACKAGING=msi" fi -EXPERIMENTAL_ARG="no"; +EXPERIMENTAL_ARG=""; MINIFIER_ARG="-DLIBND4J_BUILD_MINIFIER=false" TESTS_ARG="-DBUILD_TESTS=OFF" NAME_ARG="-DLIBND4J_NAME=$NAME" @@ -461,16 +466,12 @@ if [ "$CHIP" == "cuda" ] && [ -n "$CHIP_VERSION" ]; then esac fi -[[ -z ${MKLDNN_PATH:-} ]] && MKLDNN_PATH="" [[ -z ${OPENBLAS_PATH:-} ]] && OPENBLAS_PATH="" if [[ -n "${BUILD_PATH:-}" ]]; then PREVIFS="$IFS" IFS="$BUILD_PATH_SEPARATOR" for P in $BUILD_PATH; do - if [[ -f "$P/include/mkldnn.h" ]]; then - MKLDNN_PATH="$P" - fi if [[ -f "$P/include/openblas_config.h" ]]; then OPENBLAS_PATH="$P" fi @@ -478,18 +479,12 @@ if [[ -n "${BUILD_PATH:-}" ]]; then IFS="$PREVIFS" fi -if [[ ! -f "$MKLDNN_PATH/include/mkldnn.h" ]]; then - echo "Could not find MKL-DNN, please make sure to run the build with Maven or set the MKLDNN_PATH variable" - MKLDNN_PATH="" -fi - if [[ ! -f "$OPENBLAS_PATH/include/openblas_config.h" ]]; then echo "Could not find OpenBLAS, please make sure to run the build with Maven or set the OPENBLAS_PATH variable" OPENBLAS_PATH="" fi # replace any backslash with a slash -MKLDNN_PATH="${MKLDNN_PATH//\\//}" OPENBLAS_PATH="${OPENBLAS_PATH//\\//}" mkbuilddir() { @@ -501,6 +496,21 @@ mkbuilddir() { cd "blasbuild/$CHIP" } +if [ "$HELPER" == "" ]; then + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + echo "!! !!" + echo "!! !!" + echo "!! !!" + echo "!! !!" + echo "!! WARNING! !!" + echo "!! No helper packages configured! !!" + echo "!! You can specify helper by using -h key. I.e. <-h mkldnn> !!" + echo "!! !!" + echo "!! !!" + echo "!! !!" + echo "!! !!" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" +fi echo PACKAGING = "${PACKAGING}" echo BUILD = "${BUILD}" @@ -515,11 +525,11 @@ echo OPERATIONS = "${OPERATIONS_ARG}" echo MINIFIER = "${MINIFIER_ARG}" echo TESTS = "${TESTS_ARG}" echo NAME = "${NAME_ARG}" -echo MKLDNN_PATH = "$MKLDNN_PATH" echo OPENBLAS_PATH = "$OPENBLAS_PATH" +echo HELPERS = "$HELPER" mkbuilddir pwd -eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DMKLDNN_PATH="$MKLDNN_PATH" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. 
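Note on the new build flag: the -h/--helper value collected above is forwarded to CMake as -DHELPERS_<name>=true (the HELPERS_mkldnn branch in libnd4j/CMakeLists.txt), so a CPU build with the MKL-DNN helpers enabled would look something like `./buildnativeoperations.sh -c cpu -h mkldnn`; the exact set of other flags depends on the target. When no helper is specified, the warning banner above is printed and the library is built without any platform helpers.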
+eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DHELPERS_"$HELPER"=true "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi diff --git a/libnd4j/include/array/DataBuffer.h b/libnd4j/include/array/DataBuffer.h index 0b77289bd..37d575b13 100644 --- a/libnd4j/include/array/DataBuffer.h +++ b/libnd4j/include/array/DataBuffer.h @@ -223,7 +223,7 @@ DataBuffer::DataBuffer(const size_t lenInBytes, const DataType dataType, memory: setCountersToZero(); - if(!lenInBytes == 0) { + if(lenInBytes != 0) { allocateBuffers(allocBoth); writeSpecial(); } diff --git a/libnd4j/include/cblas.h b/libnd4j/include/cblas.h index a6bdefa54..0d484a2e0 100755 --- a/libnd4j/include/cblas.h +++ b/libnd4j/include/cblas.h @@ -31,9 +31,10 @@ #endif #ifdef HAVE_MKLDNN +// FIXME: latest mkldnn doesn't ship mklml anymore? // include CBLAS from MKL-DNN -#include -#define CBLAS_H +//#include +//#define CBLAS_H #endif #ifdef HAVE_OPENBLAS diff --git a/libnd4j/include/execution/LaunchContext.h b/libnd4j/include/execution/LaunchContext.h index 5fae2162c..076e2933b 100644 --- a/libnd4j/include/execution/LaunchContext.h +++ b/libnd4j/include/execution/LaunchContext.h @@ -29,6 +29,10 @@ #include #endif +// used for MKLDNN etc +#if !defined(__STANDALONE_BUILD__) +#include "config.h" +#endif #include #include @@ -49,6 +53,9 @@ class ND4J_EXPORT LaunchContext { static std::vector> _contexts; static std::mutex _mutex; + // used for MKLDNN + void *_engine = nullptr; + #ifdef __CUDABLAS__ #ifndef __JAVACPP_HACK__ @@ -96,6 +103,8 @@ class ND4J_EXPORT LaunchContext { _workspace = theWorkspace; } + void* engine(); + int getDeviceID() const {return _deviceID;} void setDeviceID(int deviceID) { _deviceID = deviceID; } sd::ErrorReference* errorReference(); diff --git a/libnd4j/include/execution/cpu/LaunchContext.cpp b/libnd4j/include/execution/cpu/LaunchContext.cpp index 7b3070085..8812b7802 100644 --- a/libnd4j/include/execution/cpu/LaunchContext.cpp +++ b/libnd4j/include/execution/cpu/LaunchContext.cpp @@ -29,10 +29,16 @@ nd4j::ContextBuffers contextBuffers = nd4j::ContextBuffers(); thread_local nd4j::ContextBuffers contextBuffers = nd4j::ContextBuffers(); #endif +#ifdef HAVE_MKLDNN +#include +#endif + namespace nd4j { LaunchContext::~LaunchContext() { - +#ifdef HAVE_MKLDNN + delete reinterpret_cast(_engine); +#endif } std::vector> LaunchContext::_contexts = std::vector>(); @@ -42,6 +48,10 @@ namespace nd4j { // default constructor, just to make clang/ranlib happy _workspace = nullptr; _deviceID = 0; + +#ifdef HAVE_MKLDNN + _engine = new mkldnn::engine(mkldnn::engine::kind::cpu, 0); +#endif } LaunchContext::LaunchContext(Nd4jPointer cudaStream, Nd4jPointer reductionPointer, Nd4jPointer scalarPointer, Nd4jPointer allocationPointer) { @@ -73,4 +83,8 @@ namespace nd4j { sd::ErrorReference* LaunchContext::errorReference() { return contextBuffers.errorReference(); } + + void* LaunchContext::engine() { + return _engine; + } } \ No newline at end of file diff --git a/libnd4j/include/execution/cuda/LaunchContext.cu b/libnd4j/include/execution/cuda/LaunchContext.cu index 2e08131af..5e2ac589c 100644 --- a/libnd4j/include/execution/cuda/LaunchContext.cu +++ b/libnd4j/include/execution/cuda/LaunchContext.cu @@ -169,4 +169,8 @@ LaunchContext::LaunchContext() { sd::ErrorReference* 
LaunchContext::errorReference() { return contextBuffers.errorReference(); } + + void* LaunchContext::engine() { + return _engine; + } } \ No newline at end of file diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index 129d584cc..f397d46f3 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -28,10 +28,6 @@ #include #include -#ifdef HAVE_MKLDNN -#include -#endif - // CUDA-specific includes #ifdef __CUDACC__ @@ -61,11 +57,6 @@ namespace nd4j { LaunchContext* _context = nullptr; std::vector _dataTypes; -#ifdef HAVE_MKLDNN - std::vector _mkldnnStreams; -#else - std::vector _mkldnnStreams; -#endif std::vector _fastpath_in; std::vector _fastpath_out; @@ -122,9 +113,6 @@ namespace nd4j { int getBranch(); void setBranch(int branch); -#ifdef HAVE_MKLDNN - std::vector& getMKLDNNStreams() { return _mkldnnStreams; } -#endif /** * * @return diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index 920d66f84..085fa969e 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -98,9 +98,6 @@ namespace nd4j { this->_inputs.clear(); this->_fastpath_in.clear(); this->_fastpath_out.clear(); -#ifdef HAVE_MKLDNN - this->_mkldnnStreams.clear(); -#endif for (auto v:_handles) delete v; diff --git a/libnd4j/include/helpers/MKLDNNStream.h b/libnd4j/include/helpers/MKLDNNStream.h index 82eb0d30d..f88ec0e62 100644 --- a/libnd4j/include/helpers/MKLDNNStream.h +++ b/libnd4j/include/helpers/MKLDNNStream.h @@ -21,12 +21,11 @@ #ifndef LIBND4J_MKLDNNSTREAM_H #define LIBND4J_MKLDNNSTREAM_H -#ifndef __STANDALONE_BUILD__ +#if !defined(__STANDALONE_BUILD__) #include "config.h" #endif -#ifdef HAVE_MKLDNN -#include +#if defined(HAVE_MKLDNN) namespace nd4j { class MKLDNNStream { @@ -38,26 +37,24 @@ namespace nd4j { std::vector _floatArguments; std::vector _intArguments; - mkldnn::engine _engine = mkldnn::engine(mkldnn::engine::cpu, 0); - std::vector _memory; - std::vector _operations; - public: template static bool isSupported() { + // FIXME: strict float support doesn't work anymore return typeid(X) == typeid(float) && typeid(Y) == typeid(float); } static bool isSupported(const std::vector &arrays) { - for (auto i = arrays.begin(); i != arrays.end(); i++) { - if (*i != nullptr && (*i)->dataType() != nd4j::DataType::FLOAT32) { + // FIXME: strict float support doesn't work anymore + for (auto v:arrays) { + if (v != nullptr && v->dataType() != nd4j::DataType::FLOAT32) { return false; } } return true; } - MKLDNNStream(const std::string &opName) : _opName(opName) { } + explicit MKLDNNStream(const std::string &opName) : _opName(opName) { } bool checkAndReset(const std::vector &inputs, const std::vector &outputs, const std::vector &floatArguments, const std::vector &intArguments) { @@ -66,30 +63,10 @@ namespace nd4j { _outputs = outputs; _floatArguments = floatArguments; _intArguments = intArguments; - _operations.clear(); - _memory.clear(); return true; } return false; } - - const mkldnn::engine &getEngine() { return _engine; } - void setEngine(const mkldnn::engine &engine) { _engine = engine; } - - const std::vector &getMemory() { return _memory; } - void setMemory(const std::vector &memory) { _memory = memory; } - void addMemory(const mkldnn::memory &memory) { _memory.push_back(memory); } - - const std::vector &getOperations() { return _operations; } - void setOperations(const std::vector &operations) { _operations = operations; } - void addOperation(const mkldnn::primitive &operation) { 
_operations.push_back(operation); } - - bool submitAndWait(mkldnn::stream::kind kind = mkldnn::stream::kind::eager) { - nd4j_debug("Executing %s with MKL-DNN\n", _opName.c_str()); - // need to create a new one because already executed streams become unusable - mkldnn::stream stream(kind); - return stream.submit(_operations).wait(); - } }; } #endif diff --git a/libnd4j/include/helpers/impl/OpTracker.cpp b/libnd4j/include/helpers/impl/OpTracker.cpp index 4c2083f12..1fc4f330d 100644 --- a/libnd4j/include/helpers/impl/OpTracker.cpp +++ b/libnd4j/include/helpers/impl/OpTracker.cpp @@ -21,6 +21,8 @@ #include #include #include +#include + using namespace nd4j::ops; using namespace nd4j::graph; @@ -35,6 +37,31 @@ namespace nd4j { } void OpTracker::storeOperation(nd4j::graph::OpType opType, const OpDescriptor& descriptor) { + // check out CPU features + if (!::isMinimalRequirementsMet()) { + + auto binaryLevel = ::binaryLevel(); + auto optimalLevel = ::optimalLevel(); + + switch (binaryLevel) { + case 3: { + nd4j_printf("libnd4j binary was built with AVX512 support, but current CPU doesn't have this instruction set. Exiting now...",""); + } + break; + case 2: { + nd4j_printf("libnd4j binary was built with AVX/AVX2 support, but current CPU doesn't have this instruction set. Exiting now...",""); + } + break; + default: { + nd4j_printf("Unknown binary validation error. Exiting now...",""); + } + break; + } + + // we're exiting now + exit(119); + } + // if (_map.count(opType) < 1) { std::vector vec; _map[opType] = vec; diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 705f06b99..538bd657d 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -4417,8 +4417,10 @@ INLINEDEF void calcOffsets(const Nd4jLong* shapeInfo, Nd4jLong* offsets, const c if(order == shape::order(shapeInfo) || e == 1) { // e==1 means common vector e = 1; Nd4jLong len = shape::length(shapeInfo); - while(e < len) - offsets[e++] = offsets[e - 1] + ews; + while(e < len) { + offsets[e] = offsets[e - 1] + ews; + e++; + } return; } } @@ -4464,8 +4466,10 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong if(shape[j] == 1) { --j; continue; } // ignore dimensions equal to unity if(j == rankMinusOne) { // last dimension - for(int l = 1; l < shape[j]; ++l) - offsets[i++] = offsets[i - 1] + strides[j]; + for(int l = 1; l < shape[j]; ++l) { + offsets[i] = offsets[i - 1] + strides[j]; + i++; + } --j; } else if(idx[j] < shape[j] - 1) { @@ -4489,8 +4493,10 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong if(shape[j] == 1) { ++j; continue; } // ignore dimensions equal to unity if(j == 0) { // last dimension - for(int l = 1; l < shape[j]; ++l) - offsets[i++] = offsets[i - 1] + strides[j]; + for(int l = 1; l < shape[j]; ++l) { + offsets[i] = offsets[i - 1] + strides[j]; + i++; + } ++j; } else if(idx[j] < shape[j] - 1) { diff --git a/libnd4j/include/ops/declarable/BooleanOp.h b/libnd4j/include/ops/declarable/BooleanOp.h index 2a8c031fd..b341ce394 100644 --- a/libnd4j/include/ops/declarable/BooleanOp.h +++ b/libnd4j/include/ops/declarable/BooleanOp.h @@ -32,7 +32,7 @@ namespace nd4j { OpDescriptor * _descriptor; bool prepareOutputs(Context& block); - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; public: BooleanOp(const char *name, int numInputs, bool scalar); ~BooleanOp(); diff --git a/libnd4j/include/ops/declarable/BroadcastableOp.h 
b/libnd4j/include/ops/declarable/BroadcastableOp.h index 374fd9d3d..bc2cddc59 100644 --- a/libnd4j/include/ops/declarable/BroadcastableOp.h +++ b/libnd4j/include/ops/declarable/BroadcastableOp.h @@ -30,7 +30,7 @@ namespace nd4j { namespace ops { class ND4J_EXPORT BroadcastableOp : public DeclarableCustomOp{ protected: - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; public: BroadcastableOp(const char *name, int numTArgs, int numIArgs); ~BroadcastableOp(); diff --git a/libnd4j/include/ops/declarable/DeclarableCustomOp.h b/libnd4j/include/ops/declarable/DeclarableCustomOp.h index 3dec5b15f..38cc20e71 100644 --- a/libnd4j/include/ops/declarable/DeclarableCustomOp.h +++ b/libnd4j/include/ops/declarable/DeclarableCustomOp.h @@ -30,12 +30,12 @@ namespace nd4j { /** * This method executes this Op */ - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); ~DeclarableCustomOp(); - virtual ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) = 0; + ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) override = 0; }; } } diff --git a/libnd4j/include/ops/declarable/DeclarableListOp.h b/libnd4j/include/ops/declarable/DeclarableListOp.h index 8ab38316a..6fa4fe086 100644 --- a/libnd4j/include/ops/declarable/DeclarableListOp.h +++ b/libnd4j/include/ops/declarable/DeclarableListOp.h @@ -32,7 +32,7 @@ namespace nd4j { namespace ops { class ND4J_EXPORT DeclarableListOp : public nd4j::ops::DeclarableOp { protected: - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; nd4j::NDArray* getZ(Context& block, int inputId); void setupResult(NDArray* array, Context& block); diff --git a/libnd4j/include/ops/declarable/DeclarableReductionOp.h b/libnd4j/include/ops/declarable/DeclarableReductionOp.h index 11f4ed853..4a75c5daf 100644 --- a/libnd4j/include/ops/declarable/DeclarableReductionOp.h +++ b/libnd4j/include/ops/declarable/DeclarableReductionOp.h @@ -30,12 +30,12 @@ namespace nd4j { /** * This method executes this Op */ - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); ~DeclarableReductionOp(); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h b/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h index 02866e32c..d72ed612d 100644 --- a/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyBroadcastBoolOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override ; public: LegacyBroadcastBoolOp(); LegacyBroadcastBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& 
block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyBroadcastOp.h b/libnd4j/include/ops/declarable/LegacyBroadcastOp.h index b5cb0c80f..7502b6ce7 100644 --- a/libnd4j/include/ops/declarable/LegacyBroadcastOp.h +++ b/libnd4j/include/ops/declarable/LegacyBroadcastOp.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyBroadcastOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyBroadcastOp(); LegacyBroadcastOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h b/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h index acaa0ad54..b023cdc0d 100644 --- a/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h +++ b/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h @@ -32,13 +32,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyIndexReduceOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyIndexReduceOp(); LegacyIndexReduceOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyOp.h b/libnd4j/include/ops/declarable/LegacyOp.h index 953529d38..951f60165 100644 --- a/libnd4j/include/ops/declarable/LegacyOp.h +++ b/libnd4j/include/ops/declarable/LegacyOp.h @@ -41,13 +41,13 @@ namespace nd4j { int _numInputs = 0; // All Op classes provide own specific implementation for this method - virtual Nd4jStatus validateAndExecute(Context& block) = 0; + Nd4jStatus validateAndExecute(Context& block) override = 0; public: LegacyOp(int numInputs); LegacyOp(int numInputs, int opNum); // All Op classes provide own specific implementation for this method - virtual ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) = 0; + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override = 0; virtual LegacyOp* clone() = 0; }; } diff --git a/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h b/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h index 3e77d22d3..5a2eb431f 100644 --- a/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyPairwiseTransformBoolOp: public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyPairwiseTransformBoolOp(); LegacyPairwiseTransformBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h b/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h index 8785fd2b2..27a3a6f8d 100644 --- 
a/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h +++ b/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyPairwiseTransformOp: public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyPairwiseTransformOp(); LegacyPairwiseTransformOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyRandomOp.h b/libnd4j/include/ops/declarable/LegacyRandomOp.h index f1f5dc1e4..072825ef0 100644 --- a/libnd4j/include/ops/declarable/LegacyRandomOp.h +++ b/libnd4j/include/ops/declarable/LegacyRandomOp.h @@ -32,7 +32,7 @@ namespace nd4j { */ class ND4J_EXPORT LegacyRandomOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyRandomOp(); LegacyRandomOp(int opNum); @@ -43,10 +43,10 @@ namespace nd4j { nd4j::ResultSet* execute(nd4j::graph::RandomGenerator& rng, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace = false); nd4j::ResultSet* execute(nd4j::graph::RandomGenerator& rng, std::vector& inputs, std::vector& tArgs, std::vector& iArgs, bool isInplace = false); - Nd4jStatus execute(Context* block); + Nd4jStatus execute(Context* block) override; - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyReduce3Op.h b/libnd4j/include/ops/declarable/LegacyReduce3Op.h index a3af784c9..9882f4cae 100644 --- a/libnd4j/include/ops/declarable/LegacyReduce3Op.h +++ b/libnd4j/include/ops/declarable/LegacyReduce3Op.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyReduce3Op : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyReduce3Op(); LegacyReduce3Op(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h b/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h index 3551e0a5f..e685cd38c 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h @@ -27,13 +27,13 @@ namespace nd4j { namespace ops { class ND4J_EXPORT LegacyReduceBoolOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyReduceBoolOp(); LegacyReduceBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h 
b/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h index 499d91996..f85b98384 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h @@ -27,13 +27,13 @@ namespace nd4j { namespace ops { class ND4J_EXPORT LegacyReduceFloatOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyReduceFloatOp(); LegacyReduceFloatOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyReduceLongOp.h b/libnd4j/include/ops/declarable/LegacyReduceLongOp.h index e406e3f16..171739379 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceLongOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceLongOp.h @@ -27,13 +27,13 @@ namespace nd4j { namespace ops { class ND4J_EXPORT LegacyReduceLongOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyReduceLongOp(); LegacyReduceLongOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyReduceSameOp.h b/libnd4j/include/ops/declarable/LegacyReduceSameOp.h index 10df372b2..daee7c16c 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceSameOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceSameOp.h @@ -27,13 +27,13 @@ namespace nd4j { namespace ops { class ND4J_EXPORT LegacyReduceSameOp: public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyReduceSameOp(); LegacyReduceSameOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h b/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h index cc616b7e4..915caa980 100644 --- a/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h @@ -30,15 +30,15 @@ namespace nd4j { */ class ND4J_EXPORT LegacyScalarBoolOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyScalarBoolOp(); LegacyScalarBoolOp(int opNum); LegacyScalarBoolOp(int opNum, NDArray &scalar); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyScalarOp.h b/libnd4j/include/ops/declarable/LegacyScalarOp.h index dc1b1a9fa..3cc000c85 100644 --- a/libnd4j/include/ops/declarable/LegacyScalarOp.h +++ b/libnd4j/include/ops/declarable/LegacyScalarOp.h @@ -30,15 +30,15 @@ namespace nd4j { */ class ND4J_EXPORT LegacyScalarOp : public LegacyOp { 
protected: - Nd4jStatus validateAndExecute(Context& block); + Nd4jStatus validateAndExecute(Context& block) override; public: LegacyScalarOp(); LegacyScalarOp(int opNum); LegacyScalarOp(int opNum, NDArray &scalar); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyStatsOp.h b/libnd4j/include/ops/declarable/LegacyStatsOp.h index 027459a7d..81ffd4d1b 100644 --- a/libnd4j/include/ops/declarable/LegacyStatsOp.h +++ b/libnd4j/include/ops/declarable/LegacyStatsOp.h @@ -30,13 +30,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyStatsOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyStatsOp(); LegacyStatsOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h b/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h index 09585d55b..34c30ba09 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h @@ -31,13 +31,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyTransformAnyOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyTransformAnyOp(); LegacyTransformAnyOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h b/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h index 9d3fb9006..23e761979 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h @@ -32,13 +32,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyTransformBoolOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyTransformBoolOp(); LegacyTransformBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h b/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h index 70692fe8a..3327ad9f3 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h @@ -31,13 +31,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyTransformFloatOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyTransformFloatOp(); LegacyTransformFloatOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* 
calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyTransformSameOp.h b/libnd4j/include/ops/declarable/LegacyTransformSameOp.h index 9dfca0ec8..7b847562b 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformSameOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformSameOp.h @@ -32,13 +32,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyTransformSameOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyTransformSameOp(); LegacyTransformSameOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h b/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h index 9e1ba6e75..4d1722b01 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h @@ -32,13 +32,13 @@ namespace nd4j { */ class ND4J_EXPORT LegacyTransformStrictOp : public LegacyOp { protected: - Nd4jStatus validateAndExecute(Context &block); + Nd4jStatus validateAndExecute(Context &block) override; public: LegacyTransformStrictOp(); LegacyTransformStrictOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); - virtual LegacyOp* clone(); + ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + LegacyOp* clone() override; }; } } diff --git a/libnd4j/include/ops/declarable/OpRegistrator.h b/libnd4j/include/ops/declarable/OpRegistrator.h index 35feb509c..effb71c67 100644 --- a/libnd4j/include/ops/declarable/OpRegistrator.h +++ b/libnd4j/include/ops/declarable/OpRegistrator.h @@ -26,6 +26,7 @@ #include #include #include +#include // handlers part #include @@ -59,10 +60,16 @@ namespace nd4j { std::map _msvc; + // pointers to our operations std::map _declarablesLD; std::map _declarablesD; std::vector _uniqueD; + // pointers to platform-specific helpers + std::map _helpersLH; + std::map _helpersH; + std::vector _uniqueH; + std::mutex _locker; std::string _opsList; bool isInit = false; @@ -82,16 +89,22 @@ namespace nd4j { const char * getAllCustomOperations(); /** - * This method registers operation + * This method registers operation in our registry, so we can use them later * * @param op */ bool registerOperation(const char* name, nd4j::ops::DeclarableOp* op); bool registerOperation(nd4j::ops::DeclarableOp *op); + void registerHelper(nd4j::ops::platforms::PlatformHelper* op); + + bool hasHelper(Nd4jLong hash); + nd4j::ops::DeclarableOp* getOperation(const char *name); nd4j::ops::DeclarableOp* getOperation(Nd4jLong hash); - nd4j::ops::DeclarableOp* getOperation(std::string& name); + nd4j::ops::DeclarableOp* getOperation(std::string &name); + + nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash); std::vector getAllHashes(); diff --git a/libnd4j/include/ops/declarable/PlatformHelper.h b/libnd4j/include/ops/declarable/PlatformHelper.h new file mode 100644 index 000000000..6fbbae3b8 --- /dev/null +++ b/libnd4j/include/ops/declarable/PlatformHelper.h @@ -0,0 +1,81 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 
Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_PLATFORMHELPER_H +#define SD_PLATFORMHELPER_H + +#include +#include +#include +#include +#include + +namespace nd4j { + namespace ops { + namespace platforms { + /** + * This abstract class defines methods used by platform-specific helpers implementations + */ + class ND4J_EXPORT PlatformHelper { + protected: + // name of the operation this helper is built for + std::string _name; + + // hash of the operation this helper is built for + Nd4jLong _hash; + public: + PlatformHelper(const char *name); + + ~PlatformHelper() = default; + + std::string name(); + + Nd4jLong hash(); + + /** + * This method checks, if given helper can be used with given input/output/configuration options + * + * @param context + * @return + */ + virtual bool isUsable(graph::Context &context) = 0; + + /** + * This method invokes helper. Typically this method replaces actual op execution + * + * @param context + * @return + */ + virtual Nd4jStatus invokeHelper(graph::Context &context) = 0; + + /** + * Helper method, needed for compatibility with DeclarableOp macros + * @param ctx + * @param inputId + * @return + */ + nd4j::NDArray *getZ(graph::Context &ctx, int inputId); + }; + } + } +} + + +#endif //SD_PLATFORMHELPER_H diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index eafa266dd..6ef4a49d5 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ -28,54 +28,6 @@ namespace nd4j { namespace ops { -#ifdef HAVE_MKLDNN -using namespace mkldnn; - -static void getMKLDNNMemoryDescBatchNorm(const NDArray* src, const NDArray* diff_src, const NDArray* dst, - mkldnn::memory::desc* batchnorm_src_md, mkldnn::memory::desc* batchnorm_diff_src_md, mkldnn::memory::desc* batchnorm_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis) { - const Nd4jLong* shape = src->getShapeInfo(); - Nd4jLong rank = shape[0]; - Nd4jLong dim1 = axis; // MKL-DNN supports only 1 axis, which has to be the "channel" one - Nd4jLong dim2 = axis >= 2 ? 1 : 2; - Nd4jLong dim3 = axis >= 3 ? 2 : 3; - mkldnn::memory::dims batchnorm_src_tz = { (int)shape[1], (int)shape[dim1 + 1], rank > 2 ? (int)shape[dim2 + 1] : 1, rank > 3 ? 
(int)shape[dim3 + 1] : 1}; - - auto type = mkldnn::memory::data_type::f32; - auto format = mkldnn::memory::format::nchw; - auto supposed_to_be_any_format = mkldnn::memory::format::nChw8c; // doesn't work with "any" - - if (src != nullptr && src->getBuffer() != nullptr && batchnorm_src_md != nullptr) { - *batchnorm_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); - *user_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; // overrides format - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[dim1]; - user_src_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? src->stridesOf()[dim2] : 1; - user_src_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? src->stridesOf()[dim3] : 1; - } - - if (diff_src != nullptr && diff_src->getBuffer() != nullptr && batchnorm_diff_src_md != nullptr) { - *batchnorm_diff_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); - *user_diff_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; // overrides format - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[dim1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? diff_src->stridesOf()[dim2] : 1; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? diff_src->stridesOf()[dim3] : 1; - } - - if (dst != nullptr && dst->getBuffer() != nullptr && batchnorm_dst_md != nullptr) { - *batchnorm_dst_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); - *user_dst_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; // overrides format - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[dim1]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? dst->stridesOf()[dim2] : 1; - user_dst_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? 
dst->stridesOf()[dim3] : 1; - } -} -#endif CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { auto input = INPUT_VARIABLE(0); @@ -208,84 +160,6 @@ CUSTOM_OP_IMPL(batchnorm_new, 3, 1, false, 1, 2) { for(int i = 1; i < block.width(); ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_NEW op: types of all input arrays should be the same !"); -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, mean, variance, gamma, beta, output}) && numOfAxes == 1) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("batchnorm_new")); - } - - std::vector shape({2, mean->lengthOf()}); - NDArray weights = NDArrayFactory::create('c', shape, block.launchContext()); - weights({0, 1, 0, 0}).assign(1.0f); - weights({1, 2, 0, 0}).assign(0.0f); - - if (streams[0].checkAndReset({input, mean, variance, gamma, beta}, {output}, {(float)epsilon}, axes)) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc batchnorm_src_md(empty), batchnorm_dst_md(empty), user_src_md(empty), user_dst_md(empty); - - getMKLDNNMemoryDescBatchNorm(input, nullptr, output, - &batchnorm_src_md, nullptr, &batchnorm_dst_md, - &user_src_md, nullptr, &user_dst_md, axes[0]); - - auto batchnorm_desc = batch_normalization_forward::desc(prop_kind::forward_inference, batchnorm_src_md, epsilon, - use_global_stats | (applyScale || applyOffset ? use_scale_shift : 0)); - - auto engine = streams[0].getEngine(); - auto batchnorm_prim_desc = batch_normalization_forward::primitive_desc(batchnorm_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, input->buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output->buffer()); - auto batchnorm_mean_memory = mkldnn::memory(batchnorm_prim_desc.mean_primitive_desc(), mean->buffer()); - auto batchnorm_variance_memory = mkldnn::memory(batchnorm_prim_desc.variance_primitive_desc(), variance->buffer()); - - auto batchnorm_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc({batchnorm_src_md, engine}) - != user_src_memory.get_primitive_desc()) { - batchnorm_src_memory = mkldnn::memory({batchnorm_src_md, engine}); - streams[0].addMemory(batchnorm_src_memory); - streams[0].addOperation(reorder(user_src_memory, batchnorm_src_memory)); - } - - auto batchnorm_dst_memory = user_dst_memory; - streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(batchnorm_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - batchnorm_dst_memory = mkldnn::memory(batchnorm_prim_desc.dst_primitive_desc()); - streams[0].addMemory(batchnorm_dst_memory); - } - - streams[0].addMemory(batchnorm_mean_memory); - streams[0].addMemory(batchnorm_variance_memory); - - if (applyScale || applyOffset) { - auto batchnorm_weights_memory = mkldnn::memory(batchnorm_prim_desc.weights_primitive_desc(), weights.buffer()); - streams[0].addMemory(batchnorm_weights_memory); - streams[0].addOperation(batch_normalization_forward(batchnorm_prim_desc, (mkldnn::primitive::at)batchnorm_src_memory, - (mkldnn::primitive::at)batchnorm_mean_memory, (mkldnn::primitive::at)batchnorm_variance_memory, (mkldnn::primitive::at)batchnorm_weights_memory, batchnorm_dst_memory)); - } else { - streams[0].addOperation(batch_normalization_forward(batchnorm_prim_desc, (mkldnn::primitive::at)batchnorm_src_memory, - (mkldnn::primitive::at)batchnorm_mean_memory, (mkldnn::primitive::at)batchnorm_variance_memory, 
batchnorm_dst_memory)); - } - - if (mkldnn::memory::primitive_desc(batchnorm_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(batchnorm_dst_memory, user_dst_memory)); - } - } - - if (applyScale || applyOffset) { - if (gamma != nullptr) { - weights({0, 1, 0, 0}).assign(gamma); - } - if (beta != nullptr) { - weights({1, 2, 0, 0}).assign(beta); - } - } - streams[0].submitAndWait(); - return Status::OK(); - } -#endif nd4j_debug("MKL-DNN is not used for batchnorm_new!\n", 0); // formula: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta diff --git a/libnd4j/include/ops/declarable/generic/convo/col2im.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/col2im.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/conv1d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/conv1d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/conv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/conv2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp similarity index 61% rename from libnd4j/include/ops/declarable/generic/convo/conv3d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp index 6370579d2..f7dddec32 100644 --- a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp @@ -29,12 +29,7 @@ namespace nd4j { namespace ops { -#ifdef HAVE_MKLDNN -using namespace mkldnn; -#endif - CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { - auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always auto bias = block.width() > 2 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] @@ -70,83 +65,6 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { if(isSameMode) // SAME ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, weights, bias, output})) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("conv3dnew")); - } - - if (streams[0].checkAndReset({input, weights, bias}, {output}, {}, {kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, isNCDHW})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc conv_src_md(empty), conv_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_weights_md(empty), user_bias_md(empty), user_dst_md(empty); - mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; - - ConvolutionUtils::getMKLDNNMemoryDescConv3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, isNCDHW, - bS, iC, iD, iH, iW, oC, oD, oH, oW, input, nullptr, weights, nullptr, bias, output, - &conv_src_md, nullptr, &conv_weights_md, nullptr, &conv_bias_md, &conv_dst_md, - &user_src_md, nullptr, &user_weights_md, nullptr, &user_bias_md, &user_dst_md, - conv_strides, conv_padding, conv_padding_r); - - auto conv_desc = bias != nullptr - ? convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input)->buffer()); - auto user_weights_memory = mkldnn::memory({user_weights_md, engine}, const_cast(weights)->buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output->buffer()); - - auto conv_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - conv_src_memory = mkldnn::memory(conv_prim_desc.src_primitive_desc()); - streams[0].addMemory(conv_src_memory); - streams[0].addOperation(reorder(user_src_memory, conv_src_memory)); - } - - auto conv_weights_memory = user_weights_memory; - streams[0].addMemory(user_weights_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.weights_primitive_desc()) - != user_weights_memory.get_primitive_desc()) { - conv_weights_memory = mkldnn::memory(conv_prim_desc.weights_primitive_desc()); - streams[0].addMemory(conv_weights_memory); - streams[0].addOperation(reorder(user_weights_memory, conv_weights_memory)); - } - - auto conv_dst_memory = user_dst_memory; - streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - conv_dst_memory = mkldnn::memory(conv_prim_desc.dst_primitive_desc()); - streams[0].addMemory(conv_dst_memory); - } - - if (bias != nullptr) { - auto conv_bias_memory = mkldnn::memory(conv_prim_desc.bias_primitive_desc(), bias->buffer()); - streams[0].addMemory(conv_bias_memory); - streams[0].addOperation(convolution_forward(conv_prim_desc, conv_src_memory, 
conv_weights_memory, conv_bias_memory, conv_dst_memory)); - } else { - streams[0].addOperation(convolution_forward(conv_prim_desc, conv_src_memory, conv_weights_memory, conv_dst_memory)); - } - - if (mkldnn::memory::primitive_desc(conv_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(conv_dst_memory, user_dst_memory)); - } - } - - streams[0].submitAndWait(); - return Status::OK(); - } -#endif nd4j_debug("MKL-DNN is not used for conv3dnew!\n", 0); std::vector permutForOutput; @@ -297,151 +215,6 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { if(isSameMode) // SAME ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB})) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("conv3dnew_bp_weights")); - streams.push_back(MKLDNNStream("conv3dnew_bp_data")); - } - - bool resetW = streams[0].checkAndReset({input, weights, bias, gradO}, {gradI, gradW, gradB}, {}, {kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, isNDHWC}); - bool resetI = streams[1].checkAndReset({input, weights, bias, gradO}, {gradI, gradW, gradB}, {}, {kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, isNDHWC}); - if (resetW || resetI) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc conv_src_md(empty), conv_diff_src_md(empty), conv_weights_md(empty), - conv_diff_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_weights_md(empty), - user_diff_weights_md(empty), user_bias_md(empty), user_dst_md(empty); - mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; - - ConvolutionUtils::getMKLDNNMemoryDescConv3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, isNDHWC, - bS, iC, iD, iH, iW, oC, oD, oH, oW, input, gradI, weights, gradW, gradB, gradO, - &conv_src_md, &conv_diff_src_md, &conv_weights_md, &conv_diff_weights_md, &conv_bias_md, &conv_dst_md, - &user_src_md, &user_diff_src_md, &user_weights_md, &user_diff_weights_md, &user_bias_md, &user_dst_md, - conv_strides, conv_padding, conv_padding_r); - - auto conv_desc = gradB != nullptr - ? convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, streams[0].getEngine()); - - if (gradW != nullptr) { - auto convW_desc = gradB != nullptr - ? 
convolution_backward_weights::desc( - convolution_direct, conv_src_md, conv_diff_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_backward_weights::desc( - convolution_direct, conv_src_md, conv_diff_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto convW_prim_desc = convolution_backward_weights::primitive_desc(convW_desc, engine, conv_prim_desc); - auto userW_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input)->buffer()); - auto userW_weights_memory = mkldnn::memory({user_diff_weights_md, engine}, gradW->buffer()); - auto userW_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO)->buffer()); - - auto convW_src_memory = userW_src_memory; - streams[0].addMemory(userW_src_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.src_primitive_desc()) - != userW_src_memory.get_primitive_desc()) { - convW_src_memory = mkldnn::memory(convW_prim_desc.src_primitive_desc()); - streams[0].addMemory(convW_src_memory); - streams[0].addOperation(reorder(userW_src_memory, convW_src_memory)); - } - - auto convW_weights_memory = userW_weights_memory; - streams[0].addMemory(userW_weights_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_weights_primitive_desc()) - != userW_weights_memory.get_primitive_desc()) { - convW_weights_memory = mkldnn::memory(convW_prim_desc.diff_weights_primitive_desc()); - streams[0].addMemory(convW_weights_memory); - } - - auto convW_dst_memory = userW_dst_memory; - streams[0].addMemory(userW_dst_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_dst_primitive_desc()) - != userW_dst_memory.get_primitive_desc()) { - convW_dst_memory = mkldnn::memory(convW_prim_desc.diff_dst_primitive_desc()); - streams[0].addMemory(convW_dst_memory); - streams[0].addOperation(reorder(userW_dst_memory, convW_dst_memory)); - } - - if (gradB != nullptr) { - auto convW_bias_memory = mkldnn::memory(convW_prim_desc.diff_bias_primitive_desc(), gradB->buffer()); - streams[0].addMemory(convW_bias_memory); - streams[0].addOperation(convolution_backward_weights(convW_prim_desc, convW_src_memory, convW_dst_memory, convW_weights_memory, convW_bias_memory)); - } else { - streams[0].addOperation(convolution_backward_weights(convW_prim_desc, convW_src_memory, convW_dst_memory, convW_weights_memory)); - } - - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_weights_primitive_desc()) - != userW_weights_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(convW_weights_memory, userW_weights_memory)); - } - } - - if (gradI != nullptr) { - auto convI_desc = - convolution_backward_data::desc( - convolution_direct, conv_diff_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[1].getEngine(); - auto convI_prim_desc = convolution_backward_data::primitive_desc(convI_desc, engine, conv_prim_desc); - auto userI_src_memory = mkldnn::memory({user_diff_src_md, engine}, gradI->buffer()); - auto userI_weights_memory = mkldnn::memory({user_weights_md, engine}, const_cast(weights)->buffer()); - auto userI_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO)->buffer()); - - auto convI_src_memory = userI_src_memory; - streams[1].addMemory(userI_src_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_src_primitive_desc()) - != userI_src_memory.get_primitive_desc()) { - 
convI_src_memory = mkldnn::memory(convI_prim_desc.diff_src_primitive_desc()); - streams[1].addMemory(convI_src_memory); - } - - auto convI_weights_memory = userI_weights_memory; - streams[1].addMemory(userI_weights_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.weights_primitive_desc()) - != userI_weights_memory.get_primitive_desc()) { - convI_weights_memory = mkldnn::memory(convI_prim_desc.weights_primitive_desc()); - streams[1].addMemory(convI_weights_memory); - streams[1].addOperation(reorder(userI_weights_memory, convI_weights_memory)); - } - - auto convI_dst_memory = userI_dst_memory; - streams[1].addMemory(userI_dst_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_dst_primitive_desc()) - != userI_dst_memory.get_primitive_desc()) { - convI_dst_memory = mkldnn::memory(convI_prim_desc.diff_dst_primitive_desc()); - streams[1].addMemory(convI_dst_memory); - streams[1].addOperation(reorder(userI_dst_memory, convI_dst_memory)); - } - - streams[1].addOperation(convolution_backward_data(convI_prim_desc, convI_dst_memory, convI_weights_memory, convI_src_memory)); - - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_src_primitive_desc()) - != userI_src_memory.get_primitive_desc()) { - streams[1].addOperation(reorder(convI_src_memory, userI_src_memory)); - } - } - } - - if (gradW != nullptr) { - streams[0].submitAndWait(); - } - if (gradI != nullptr) { - streams[1].submitAndWait(); - } - return Status::OK(); - } -#endif nd4j_debug("MKL-DNN is not used for conv3dnew_bp!\n", 0); std::vector gradOaxesForDot; diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv2d_tf.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/deconv2d_tf.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/depthwiseConv2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/dilation2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/dilation2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/im2col.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/im2col.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/ismax.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp similarity index 100% rename from 
libnd4j/include/ops/declarable/generic/convo/ismax.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pointwiseConv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pointwiseConv2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/sconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/sconv2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/upsampling2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/upsampling2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/upsampling3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/upsampling3d.cpp rename to libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp diff --git a/libnd4j/include/ops/declarable/generic/nn/lrn.cpp b/libnd4j/include/ops/declarable/generic/nn/lrn.cpp index eabee6cad..17a2d7175 100644 --- a/libnd4j/include/ops/declarable/generic/nn/lrn.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/lrn.cpp @@ -41,7 +41,6 @@ namespace nd4j { REQUIRE_TRUE(input->rankOf() == 4, 0, "lrn: Input rank of 4 expected, but got %i instead", input->rankOf()); - // FIXME: double? 
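The MKL-DNN blocks removed in the hunks above (batchnorm, conv3dnew, and the descriptor helpers that follow) are superseded by the platform-helper mechanism this patch introduces: a PlatformHelper subclass per op decides via isUsable() whether the accelerated path applies and, if so, runs it in invokeHelper(), while OpRegistrator keeps the hash-to-helper map added earlier in this patch. A rough sketch of such a helper, assuming a hypothetical MKLDNN_CONV2D class (the real implementations live in the platform-specific sources, which are not part of this hunk):

    // Sketch only: a hypothetical platform helper wired into the registry added by this patch.
    namespace nd4j { namespace ops { namespace platforms {

    class MKLDNN_CONV2D : public PlatformHelper {
    public:
        MKLDNN_CONV2D() : PlatformHelper("conv2d") { }     // name/hash handled by the base class

        bool isUsable(graph::Context &context) override {
            // real helpers inspect input ranks/data types in the context before committing
            return true;
        }

        Nd4jStatus invokeHelper(graph::Context &context) override {
            auto output = this->getZ(context, 0);           // getZ() comes from PlatformHelper itself
            // ... build MKL-DNN primitives and write the result into output here ...
            return Status::OK();
        }
    };

    }}}

    // Registration would then go through the hook added to OpRegistrator in this patch,
    // something along the lines of registerHelper(new platforms::MKLDNN_CONV2D());
    // the exact singleton accessor is not shown in this hunk.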
double alpha = T_ARG(1); double beta = T_ARG(2); double bias = T_ARG(0); diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/avgpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/avgpool2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/avgpool3d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/avgpool3d.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/maxpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/maxpool2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/maxpool3d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/maxpool3d.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/maxpool_with_argmax.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/maxpool_with_argmax.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp diff --git a/libnd4j/include/ops/declarable/generic/convo/pooling/pnormpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp similarity index 100% rename from libnd4j/include/ops/declarable/generic/convo/pooling/pnormpool2d.cpp rename to libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp diff --git a/libnd4j/include/ops/declarable/helpers/convolutions.h b/libnd4j/include/ops/declarable/helpers/convolutions.h index 484a6345c..fc7c41034 100644 --- a/libnd4j/include/ops/declarable/helpers/convolutions.h +++ b/libnd4j/include/ops/declarable/helpers/convolutions.h @@ -24,9 +24,6 @@ #include #include -#ifdef HAVE_MKLDNN -#include -#endif #include namespace nd4j { @@ -197,44 +194,6 @@ namespace nd4j { } -#ifdef HAVE_MKLDNN - static void getMKLDNNMemoryDescConv2d( - int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, bool isSameMode, bool isNCHW, - int bS, int iC, int iH, int iW, int oC, int oH, int oW, const NDArray* src, const NDArray* diff_src, - const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, - mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, - mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, - mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r); - - static void getMKLDNNMemoryDescConv3d( - int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, bool 
isSameMode, bool isNCDHW, - int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, const NDArray* src, const NDArray* diff_src, - const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, - mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, - mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, - mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r); - - static void getMKLDNNMemoryDescPool2d( - int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, int poolingMode, int extraParam0, bool isNCHW, - int bS, int iC, int iH, int iW, int oC, int oH, int oW, - const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, - mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r); - - static void getMKLDNNMemoryDescPool3d( - int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, int poolingMode, int extraParam0, bool isNCDHW, - int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, - const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, - mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r); -#endif - static void conv2d(nd4j::graph::Context &context, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW); // static void conv2d(nd4j::graph::Context & block, const std::vector& inArrs, NDArray* output, const std::vector& intArgs); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 3d04bc129..a03e1f7ac 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -28,121 +28,6 @@ namespace nd4j { namespace ops { -#ifdef HAVE_MKLDNN - using namespace mkldnn; - -void ConvolutionUtils::getMKLDNNMemoryDescPool2d( - int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, int poolingMode, int extraParam0, bool isNCHW, - int bS, int iC, int iH, int iW, int oC, int oH, int oW, - const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, - mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, 
mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r) { - mkldnn::memory::dims pool_src_tz = { bS, iC, iH, iW }; - mkldnn::memory::dims pool_dst_tz = { bS, oC, oH, oW }; - - pool_strides = { sH, sW }; - pool_kernel = { kH, kW }; - pool_padding = { pH, pW }; - pool_padding_r = { (oH - 1) * sH - iH + kH - pH, - (oW - 1) * sW - iW + kW - pW }; - - algorithm = poolingMode == 0 ? pooling_max - : extraParam0 == 0 ? pooling_avg_exclude_padding - : pooling_avg_include_padding; - auto type = mkldnn::memory::data_type::f32; - auto format = isNCHW ? mkldnn::memory::format::nchw : mkldnn::memory::format::nhwc; - auto supposed_to_be_any_format = mkldnn::memory::format::nChw8c; // doesn't work with "any" - - if (src != nullptr && src->getBuffer() != nullptr && pool_src_md != nullptr) { - *pool_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); - *user_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[isNCHW ? 0 : 0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[isNCHW ? 1 : 3]; - user_src_md->data.layout_desc.blocking.strides[0][2] = src->stridesOf()[isNCHW ? 2 : 1]; - user_src_md->data.layout_desc.blocking.strides[0][3] = src->stridesOf()[isNCHW ? 3 : 2]; - } - - if (diff_src != nullptr && diff_src->getBuffer() != nullptr && pool_diff_src_md != nullptr) { - *pool_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); - *user_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[isNCHW ? 0 : 0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[isNCHW ? 1 : 3]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = diff_src->stridesOf()[isNCHW ? 2 : 1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = diff_src->stridesOf()[isNCHW ? 3 : 2]; - } - - if (dst != nullptr && dst->getBuffer() != nullptr && pool_dst_md != nullptr) { - *pool_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, supposed_to_be_any_format); - *user_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[isNCHW ? 0 : 0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[isNCHW ? 1 : 3]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = dst->stridesOf()[isNCHW ? 2 : 1]; - user_dst_md->data.layout_desc.blocking.strides[0][3] = dst->stridesOf()[isNCHW ? 
3 : 2]; - } -} - -void ConvolutionUtils::getMKLDNNMemoryDescPool3d( - int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, int poolingMode, int extraParam0, bool isNCDHW, - int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, - const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, - mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r) { - mkldnn::memory::dims pool_src_tz = { bS, iC, iD, iH, iW }; - mkldnn::memory::dims pool_dst_tz = { bS, oC, oD, oH, oW }; - - pool_strides = { sD, sH, sW }; - pool_kernel = { kD, kH, kW }; - pool_padding = { pD, pH, pW }; - pool_padding_r = { (oD - 1) * sD - iD + kD - pD, - (oH - 1) * sH - iH + kH - pH, - (oW - 1) * sW - iW + kW - pW }; - - algorithm = poolingMode == 0 ? pooling_max - : extraParam0 == 0 ? pooling_avg_exclude_padding - : pooling_avg_include_padding; - auto type = mkldnn::memory::data_type::f32; - auto format = isNCDHW ? mkldnn::memory::format::ncdhw : mkldnn::memory::format::ndhwc; - auto supposed_to_be_any_format = mkldnn::memory::format::nCdhw8c; // doesn't work with "any" - - if (src != nullptr && src->getBuffer() != nullptr && pool_src_md != nullptr) { - *pool_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); - *user_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[isNCDHW ? 0 : 0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[isNCDHW ? 1 : 4]; - user_src_md->data.layout_desc.blocking.strides[0][2] = src->stridesOf()[isNCDHW ? 2 : 1]; - user_src_md->data.layout_desc.blocking.strides[0][3] = src->stridesOf()[isNCDHW ? 3 : 2]; - user_src_md->data.layout_desc.blocking.strides[0][4] = src->stridesOf()[isNCDHW ? 4 : 3]; - } - - if (diff_src != nullptr && diff_src->getBuffer() != nullptr && pool_diff_src_md != nullptr) { - *pool_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); - *user_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[isNCDHW ? 0 : 0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[isNCDHW ? 1 : 4]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = diff_src->stridesOf()[isNCDHW ? 2 : 1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = diff_src->stridesOf()[isNCDHW ? 3 : 2]; - user_diff_src_md->data.layout_desc.blocking.strides[0][4] = diff_src->stridesOf()[isNCDHW ? 4 : 3]; - } - - if (dst != nullptr && dst->getBuffer() != nullptr && pool_dst_md != nullptr) { - *pool_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, supposed_to_be_any_format); - *user_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[isNCDHW ? 
0 : 0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[isNCDHW ? 1 : 4]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = dst->stridesOf()[isNCDHW ? 2 : 1]; - user_dst_md->data.layout_desc.blocking.strides[0][3] = dst->stridesOf()[isNCDHW ? 3 : 2]; - user_dst_md->data.layout_desc.blocking.strides[0][4] = dst->stridesOf()[isNCDHW ? 4 : 3]; - } -} -#endif ////////////////////////////////////////////////////////////////////////// // [bS, iC, iD, iH, iW] is convoluted to [bS, iC, kD, kH, kW, oD, oH, oW] @@ -348,174 +233,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescPool3d( } -#ifdef HAVE_MKLDNN - using namespace mkldnn; - -void ConvolutionUtils::getMKLDNNMemoryDescConv2d( - int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, bool isSameMode, bool isNCHW, - int bS, int iC, int iH, int iW, int oC, int oH, int oW, const NDArray* src, const NDArray* diff_src, - const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, - mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, - mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, - mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r) { - mkldnn::memory::dims conv_src_tz = { bS, iC, iH, iW }; - mkldnn::memory::dims conv_weights_tz = { oC, iC, kH, kW }; - mkldnn::memory::dims conv_bias_tz = { oC }; - mkldnn::memory::dims conv_dst_tz = { bS, oC, oH, oW }; - - conv_strides = { sH, sW }; - conv_padding = { pH, pW }; - conv_padding_r = { (oH - 1) * sH - iH + kH - pH, - (oW - 1) * sW - iW + kW - pW }; - - auto type = mkldnn::memory::data_type::f32; - auto format = isNCHW ? mkldnn::memory::format::nchw : mkldnn::memory::format::nhwc; - auto formatw = mkldnn::memory::format::hwio; - - if (src != nullptr && conv_src_md != nullptr) { - *conv_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format::any); - *user_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[isNCHW ? 0 : 0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[isNCHW ? 1 : 3]; - user_src_md->data.layout_desc.blocking.strides[0][2] = src->stridesOf()[isNCHW ? 2 : 1]; - user_src_md->data.layout_desc.blocking.strides[0][3] = src->stridesOf()[isNCHW ? 3 : 2]; - } - - if (diff_src != nullptr && conv_diff_src_md != nullptr) { - *conv_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format::any); - *user_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[isNCHW ? 0 : 0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[isNCHW ? 1 : 3]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = diff_src->stridesOf()[isNCHW ? 2 : 1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = diff_src->stridesOf()[isNCHW ? 
3 : 2]; - } - - if (weights != nullptr && conv_weights_md != nullptr) { - *conv_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format::any); - *user_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); - user_weights_md->data.format = mkldnn_blocked; // overrides "formatw = hwio" - user_weights_md->data.layout_desc.blocking.strides[0][0] = weights->stridesOf()[3]; - user_weights_md->data.layout_desc.blocking.strides[0][1] = weights->stridesOf()[2]; - user_weights_md->data.layout_desc.blocking.strides[0][2] = weights->stridesOf()[0]; - user_weights_md->data.layout_desc.blocking.strides[0][3] = weights->stridesOf()[1]; - } - - if (diff_weights != nullptr && conv_diff_weights_md != nullptr) { - *conv_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format::any); - *user_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); - user_diff_weights_md->data.format = mkldnn_blocked; // overrides "formatw = hwio" - user_diff_weights_md->data.layout_desc.blocking.strides[0][0] = diff_weights->stridesOf()[3]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][1] = diff_weights->stridesOf()[2]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][2] = diff_weights->stridesOf()[0]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][3] = diff_weights->stridesOf()[1]; - } - - if (bias != nullptr && conv_bias_md != nullptr) { - *conv_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format::any); - *user_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format::x); - } - - if (dst != nullptr && conv_dst_md != nullptr) { - *conv_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, mkldnn::memory::format::any); - *user_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[isNCHW ? 0 : 0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[isNCHW ? 1 : 3]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = dst->stridesOf()[isNCHW ? 2 : 1]; - user_dst_md->data.layout_desc.blocking.strides[0][3] = dst->stridesOf()[isNCHW ? 
3 : 2]; - } -} - -void ConvolutionUtils::getMKLDNNMemoryDescConv3d( - int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, bool isSameMode, bool isNCDHW, - int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, const NDArray* src, const NDArray* diff_src, - const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, - mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, - mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, - mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, - mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r) { - mkldnn::memory::dims conv_src_tz = { bS, iC, iD, iH, iW }; - mkldnn::memory::dims conv_weights_tz = { oC, iC, kD, kH, kW }; - mkldnn::memory::dims conv_bias_tz = { oC }; - mkldnn::memory::dims conv_dst_tz = { bS, oC, oD, oH, oW }; - - conv_strides = { sD, sH, sW }; - conv_padding = { pD, pH, pW }; - conv_padding_r = { (oD - 1) * sD - iD + kD - pD, - (oH - 1) * sH - iH + kH - pH, - (oW - 1) * sW - iW + kW - pW }; - - auto type = mkldnn::memory::data_type::f32; - auto format = isNCDHW ? mkldnn::memory::format::ncdhw : mkldnn::memory::format::ndhwc; - auto formatw = mkldnn::memory::format::dhwio; - - if (src != nullptr && conv_src_md != nullptr) { - *conv_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format::any); - *user_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[isNCDHW ? 0 : 0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[isNCDHW ? 1 : 4]; - user_src_md->data.layout_desc.blocking.strides[0][2] = src->stridesOf()[isNCDHW ? 2 : 1]; - user_src_md->data.layout_desc.blocking.strides[0][3] = src->stridesOf()[isNCDHW ? 3 : 2]; - user_src_md->data.layout_desc.blocking.strides[0][4] = src->stridesOf()[isNCDHW ? 4 : 3]; - } - - if (diff_src != nullptr && conv_diff_src_md != nullptr) { - *conv_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format::any); - *user_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[isNCDHW ? 0 : 0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[isNCDHW ? 1 : 4]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = diff_src->stridesOf()[isNCDHW ? 2 : 1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = diff_src->stridesOf()[isNCDHW ? 3 : 2]; - user_diff_src_md->data.layout_desc.blocking.strides[0][4] = diff_src->stridesOf()[isNCDHW ? 
4 : 3]; - } - - if (weights != nullptr && conv_weights_md != nullptr) { - *conv_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format::any); - *user_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); - user_weights_md->data.format = mkldnn_blocked; // overrides "formatw = dhwio" - user_weights_md->data.layout_desc.blocking.strides[0][0] = weights->stridesOf()[4]; - user_weights_md->data.layout_desc.blocking.strides[0][1] = weights->stridesOf()[3]; - user_weights_md->data.layout_desc.blocking.strides[0][2] = weights->stridesOf()[0]; - user_weights_md->data.layout_desc.blocking.strides[0][3] = weights->stridesOf()[1]; - user_weights_md->data.layout_desc.blocking.strides[0][4] = weights->stridesOf()[2]; - } - - if (diff_weights != nullptr && conv_diff_weights_md != nullptr) { - *conv_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format::any); - *user_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); - user_diff_weights_md->data.format = mkldnn_blocked; // overrides "formatw = dhwio" - user_diff_weights_md->data.layout_desc.blocking.strides[0][0] = diff_weights->stridesOf()[4]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][1] = diff_weights->stridesOf()[3]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][2] = diff_weights->stridesOf()[0]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][3] = diff_weights->stridesOf()[1]; - user_diff_weights_md->data.layout_desc.blocking.strides[0][4] = diff_weights->stridesOf()[2]; - } - - if (bias != nullptr && conv_bias_md != nullptr) { - *conv_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format::any); - *user_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format::x); - } - - if (dst != nullptr && conv_dst_md != nullptr) { - *conv_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, mkldnn::memory::format::any); - *user_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[isNCDHW ? 0 : 0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[isNCDHW ? 1 : 4]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = dst->stridesOf()[isNCDHW ? 2 : 1]; - user_dst_md->data.layout_desc.blocking.strides[0][3] = dst->stridesOf()[isNCDHW ? 3 : 2]; - user_dst_md->data.layout_desc.blocking.strides[0][4] = dst->stridesOf()[isNCDHW ? 
4 : 3]; - } -} -#endif - ////////////////////////////////////////////////////////////////////////// template static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { @@ -543,83 +260,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( if(isSameMode) // SAME ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("conv2d")); - } - - if (streams[0].checkAndReset({input, weights, bias}, {output}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc conv_src_md(empty), conv_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_weights_md(empty), user_bias_md(empty), user_dst_md(empty); - mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; - - ConvolutionUtils::getMKLDNNMemoryDescConv2d(kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW, - bS, iC, iH, iW, oC, oH, oW, input, nullptr, weights, nullptr, bias, output, - &conv_src_md, nullptr, &conv_weights_md, nullptr, &conv_bias_md, &conv_dst_md, - &user_src_md, nullptr, &user_weights_md, nullptr, &user_bias_md, &user_dst_md, - conv_strides, conv_padding, conv_padding_r); - - auto conv_desc = bias != nullptr - ? convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input)->buffer()); - auto user_weights_memory = mkldnn::memory({user_weights_md, engine}, const_cast(weights)->buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output->buffer()); - - auto conv_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - conv_src_memory = mkldnn::memory(conv_prim_desc.src_primitive_desc()); - streams[0].addMemory(conv_src_memory); - streams[0].addOperation(reorder(user_src_memory, conv_src_memory)); - } - - auto conv_weights_memory = user_weights_memory; - streams[0].addMemory(user_weights_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.weights_primitive_desc()) - != user_weights_memory.get_primitive_desc()) { - conv_weights_memory = mkldnn::memory(conv_prim_desc.weights_primitive_desc()); - streams[0].addMemory(conv_weights_memory); - streams[0].addOperation(reorder(user_weights_memory, conv_weights_memory)); - } - - auto conv_dst_memory = user_dst_memory; - streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(conv_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - conv_dst_memory = mkldnn::memory(conv_prim_desc.dst_primitive_desc()); - streams[0].addMemory(conv_dst_memory); 
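One detail worth keeping in mind when reading the removed descriptor helpers above and below: they all derive the right-hand padding from the already-computed output size as (o - 1) * s - i + k - p per spatial dimension. A small, self-contained check of that arithmetic (illustrative numbers only, not taken from the patch):

    // Sketch only: verifies the padding_r formula used by the removed MKL-DNN descriptor helpers.
    #include <cassert>

    int main() {
        const int iH = 5, kH = 3, sH = 2, pH = 1;              // input, kernel, stride, left padding
        const int oH = (iH + sH - 1) / sH;                     // SAME-style output height, here 3
        const int padding_r = (oH - 1) * sH - iH + kH - pH;    // formula from the removed helpers

        // With that much padding on the right, the strided window count reproduces oH.
        assert((iH + pH + padding_r - kH) / sH + 1 == oH);
        return 0;
    }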
- } - - if (bias != nullptr) { - auto conv_bias_memory = mkldnn::memory(conv_prim_desc.bias_primitive_desc(), const_cast(bias)->buffer()); - streams[0].addMemory(conv_bias_memory); - streams[0].addOperation(convolution_forward(conv_prim_desc, conv_src_memory, conv_weights_memory, conv_bias_memory, conv_dst_memory)); - } else { - streams[0].addOperation(convolution_forward(conv_prim_desc, conv_src_memory, conv_weights_memory, conv_dst_memory)); - } - - if (mkldnn::memory::primitive_desc(conv_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(conv_dst_memory, user_dst_memory)); - } - } - - streams[0].submitAndWait(); - return; - } -#endif nd4j_debug("MKL-DNN is not used for conv2d!\n", 0); std::vector permutForOutput; @@ -686,151 +326,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( if(isSameMode) // SAME ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("conv2d_bp_weights")); - streams.push_back(MKLDNNStream("conv2d_bp_data")); - } - - bool resetW = streams[0].checkAndReset({input, weights, bias, gradO}, {gradI, gradW, gradB}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW}); - bool resetI = streams[1].checkAndReset({input, weights, bias, gradO}, {gradI, gradW, gradB}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW}); - if (resetW || resetI) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc conv_src_md(empty), conv_diff_src_md(empty), conv_weights_md(empty), - conv_diff_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_weights_md(empty), - user_diff_weights_md(empty), user_bias_md(empty), user_dst_md(empty); - mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; - - ConvolutionUtils::getMKLDNNMemoryDescConv2d(kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW, - bS, iC, iH, iW, oC, oH, oW, input, gradI, weights, gradW, gradB, gradO, - &conv_src_md, &conv_diff_src_md, &conv_weights_md, &conv_diff_weights_md, &conv_bias_md, &conv_dst_md, - &user_src_md, &user_diff_src_md, &user_weights_md, &user_diff_weights_md, &user_bias_md, &user_dst_md, - conv_strides, conv_padding, conv_padding_r); - - auto conv_desc = gradB != nullptr - ? convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward, - convolution_direct, conv_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, streams[0].getEngine()); - - if (gradW != nullptr) { - auto convW_desc = gradB != nullptr - ? 
convolution_backward_weights::desc( - convolution_direct, conv_src_md, conv_diff_weights_md, conv_bias_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero) - : convolution_backward_weights::desc( - convolution_direct, conv_src_md, conv_diff_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto convW_prim_desc = convolution_backward_weights::primitive_desc(convW_desc, engine, conv_prim_desc); - auto userW_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input)->buffer()); - auto userW_weights_memory = mkldnn::memory({user_diff_weights_md, engine}, gradW->buffer()); - auto userW_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO)->buffer()); - - auto convW_src_memory = userW_src_memory; - streams[0].addMemory(userW_src_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.src_primitive_desc()) - != userW_src_memory.get_primitive_desc()) { - convW_src_memory = mkldnn::memory(convW_prim_desc.src_primitive_desc()); - streams[0].addMemory(convW_src_memory); - streams[0].addOperation(reorder(userW_src_memory, convW_src_memory)); - } - - auto convW_weights_memory = userW_weights_memory; - streams[0].addMemory(userW_weights_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_weights_primitive_desc()) - != userW_weights_memory.get_primitive_desc()) { - convW_weights_memory = mkldnn::memory(convW_prim_desc.diff_weights_primitive_desc()); - streams[0].addMemory(convW_weights_memory); - } - - auto convW_dst_memory = userW_dst_memory; - streams[0].addMemory(userW_dst_memory); - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_dst_primitive_desc()) - != userW_dst_memory.get_primitive_desc()) { - convW_dst_memory = mkldnn::memory(convW_prim_desc.diff_dst_primitive_desc()); - streams[0].addMemory(convW_dst_memory); - streams[0].addOperation(reorder(userW_dst_memory, convW_dst_memory)); - } - - if (gradB != nullptr) { - auto convW_bias_memory = mkldnn::memory(convW_prim_desc.diff_bias_primitive_desc(), gradB->buffer()); - streams[0].addMemory(convW_bias_memory); - streams[0].addOperation(convolution_backward_weights(convW_prim_desc, convW_src_memory, convW_dst_memory, convW_weights_memory, convW_bias_memory)); - } else { - streams[0].addOperation(convolution_backward_weights(convW_prim_desc, convW_src_memory, convW_dst_memory, convW_weights_memory)); - } - - if (mkldnn::memory::primitive_desc(convW_prim_desc.diff_weights_primitive_desc()) - != userW_weights_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(convW_weights_memory, userW_weights_memory)); - } - } - - if (gradI != nullptr) { - auto convI_desc = - convolution_backward_data::desc( - convolution_direct, conv_diff_src_md, conv_weights_md, - conv_dst_md, conv_strides, conv_padding, conv_padding_r, padding_kind::zero); - - auto engine = streams[1].getEngine(); - auto convI_prim_desc = convolution_backward_data::primitive_desc(convI_desc, engine, conv_prim_desc); - auto userI_src_memory = mkldnn::memory({user_diff_src_md, engine}, gradI->buffer()); - auto userI_weights_memory = mkldnn::memory({user_weights_md, engine}, const_cast(weights)->buffer()); - auto userI_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO)->buffer()); - - auto convI_src_memory = userI_src_memory; - streams[1].addMemory(userI_src_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_src_primitive_desc()) - != userI_src_memory.get_primitive_desc()) { - 
convI_src_memory = mkldnn::memory(convI_prim_desc.diff_src_primitive_desc()); - streams[1].addMemory(convI_src_memory); - } - - auto convI_weights_memory = userI_weights_memory; - streams[1].addMemory(userI_weights_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.weights_primitive_desc()) - != userI_weights_memory.get_primitive_desc()) { - convI_weights_memory = mkldnn::memory(convI_prim_desc.weights_primitive_desc()); - streams[1].addMemory(convI_weights_memory); - streams[1].addOperation(reorder(userI_weights_memory, convI_weights_memory)); - } - - auto convI_dst_memory = userI_dst_memory; - streams[1].addMemory(userI_dst_memory); - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_dst_primitive_desc()) - != userI_dst_memory.get_primitive_desc()) { - convI_dst_memory = mkldnn::memory(convI_prim_desc.diff_dst_primitive_desc()); - streams[1].addMemory(convI_dst_memory); - streams[1].addOperation(reorder(userI_dst_memory, convI_dst_memory)); - } - - streams[1].addOperation(convolution_backward_data(convI_prim_desc, convI_dst_memory, convI_weights_memory, convI_src_memory)); - - if (mkldnn::memory::primitive_desc(convI_prim_desc.diff_src_primitive_desc()) - != userI_src_memory.get_primitive_desc()) { - streams[1].addOperation(reorder(convI_src_memory, userI_src_memory)); - } - } - } - - if (gradW != nullptr) { - streams[0].submitAndWait(); - } - if (gradI != nullptr) { - streams[1].submitAndWait(); - } - return; - } -#endif nd4j_debug("MKL-DNN is not used for conv2d_bp!\n", 0); std::vector gradOaxesForDot; @@ -1268,62 +763,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( const int oH = output.sizeAt(2); const int oW = output.sizeAt(3); -#ifdef HAVE_MKLDNN - if (poolingMode < 2 && block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("pooling2d")); - } - - if (streams[0].checkAndReset({&input}, {&output}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); - mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; - mkldnn::algorithm algorithm; - - ConvolutionUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, true, - bS, iC, iH, iW, oC, oH, oW, &input, nullptr, &output, algorithm, - &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, &user_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r); - - auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, pool_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input).buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output.buffer()); - - auto pool_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - pool_src_memory = mkldnn::memory(pool_prim_desc.src_primitive_desc()); - streams[0].addMemory(pool_src_memory); - streams[0].addOperation(reorder(user_src_memory, pool_src_memory)); - } - - auto pool_dst_memory = user_dst_memory; - 
streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_primitive_desc()); - streams[0].addMemory(pool_dst_memory); - } - - streams[0].addOperation(pooling_forward(pool_prim_desc, pool_src_memory, pool_dst_memory)); - - if (mkldnn::memory::primitive_desc(pool_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(pool_dst_memory, user_dst_memory)); - } - } - - streams[0].submitAndWait(); - return; - } -#endif nd4j_debug("MKL-DNN is not used for pooling2d!\n", 0); const Nd4jLong iStride0 = input.stridesOf()[0]; @@ -1504,62 +943,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( const int oH = output.sizeAt(3); const int oW = output.sizeAt(4); -#ifdef HAVE_MKLDNN - if (poolingMode < 2 && block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("pooling3d")); - } - - if (streams[0].checkAndReset({&input}, {&output}, {}, {kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); - mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; - mkldnn::algorithm algorithm; - - ConvolutionUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0, true, - bS, iC, iD, iH, iW, oC, oD, oH, oW, &input, nullptr, &output, algorithm, - &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, &user_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r); - - auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, pool_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input).buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output.buffer()); - - auto pool_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - pool_src_memory = mkldnn::memory(pool_prim_desc.src_primitive_desc()); - streams[0].addMemory(pool_src_memory); - streams[0].addOperation(reorder(user_src_memory, pool_src_memory)); - } - - auto pool_dst_memory = user_dst_memory; - streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_primitive_desc()); - streams[0].addMemory(pool_dst_memory); - } - - streams[0].addOperation(pooling_forward(pool_prim_desc, pool_src_memory, pool_dst_memory)); - - if (mkldnn::memory::primitive_desc(pool_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(pool_dst_memory, user_dst_memory)); - } - } - - streams[0].submitAndWait(); - return; - } -#endif nd4j_debug("MKL-DNN is not used for pooling3d!\n", 0); const Nd4jLong iStride0 = input.stridesOf()[0]; @@ -1776,91 +1159,6 @@ void 
ConvolutionUtils::getMKLDNNMemoryDescConv3d( const int oH = gradO.sizeAt(2); const int oW = gradO.sizeAt(3); -#ifdef HAVE_MKLDNN - if (poolingMode < 2 && block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("pooling2d_bp")); - } - - if (streams[0].checkAndReset({&input, &gradO}, {&gradI}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); - mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; - mkldnn::algorithm algorithm; - - ConvolutionUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, true, - bS, iC, iH, iW, oC, oH, oW, &input, &gradI, &gradO, algorithm, - &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, &user_diff_src_md, &user_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r); - - // input is sometimes null, so we can't rely on pool_src_md being valid - auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, - const_cast(input).buffer() != nullptr ? pool_src_md : pool_diff_src_md, - pool_dst_md, pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); - - auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); - auto userB_src_memory = mkldnn::memory({user_src_md, engine}, gradI.buffer()); - auto userB_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO).buffer()); - - auto poolB_src_memory = userB_src_memory; - streams[0].addMemory(userB_src_memory); - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_src_primitive_desc()) - != userB_src_memory.get_primitive_desc()) { - poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_primitive_desc()); - streams[0].addMemory(poolB_src_memory); - } - - auto poolB_dst_memory = userB_dst_memory; - streams[0].addMemory(userB_dst_memory); - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_dst_primitive_desc()) - != userB_dst_memory.get_primitive_desc()) { - poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_primitive_desc()); - streams[0].addMemory(poolB_dst_memory); - streams[0].addOperation(reorder(userB_dst_memory, poolB_dst_memory)); - } - - if (algorithm == mkldnn::pooling_max) { - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input).buffer()); - - auto pool_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - pool_src_memory = mkldnn::memory(pool_prim_desc.src_primitive_desc()); - streams[0].addMemory(pool_src_memory); - streams[0].addOperation(reorder(user_src_memory, pool_src_memory)); - } - - auto pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_primitive_desc()); - streams[0].addMemory(pool_dst_memory); - - auto pool_workspace_memory = mkldnn::memory(pool_prim_desc.workspace_primitive_desc()); - streams[0].addMemory(pool_workspace_memory); - - 
streams[0].addOperation(pooling_forward(pool_prim_desc, pool_src_memory, pool_dst_memory, pool_workspace_memory)); - streams[0].addOperation(pooling_backward(poolB_prim_desc, poolB_dst_memory, pool_workspace_memory, poolB_src_memory)); - } else { - streams[0].addOperation(pooling_backward(poolB_prim_desc, poolB_dst_memory, poolB_src_memory)); - } - - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_src_primitive_desc()) - != userB_src_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(poolB_src_memory, userB_src_memory)); - } - } - - streams[0].submitAndWait(); - return; - } -#endif nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0); const Nd4jLong iStride0 = input.stridesOf()[0]; @@ -2099,94 +1397,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( const int oH = gradO.sizeAt(3); const int oW = gradO.sizeAt(4); -#ifdef HAVE_MKLDNN - if (poolingMode < 2 && block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported()) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("pooling3d_bp")); - } - - if (streams[0].checkAndReset({&input, &gradO}, {&gradI}, {}, {kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); - mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); - mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; - mkldnn::algorithm algorithm; - - ConvolutionUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0, true, - bS, iC, iD, iH, iW, oC, oD, oH, oW, &input, &gradI, &gradO, algorithm, - &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, &user_diff_src_md, &user_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r); - - // input is sometimes null, so we can't rely on pool_src_md being valid - if (const_cast(input).buffer() == nullptr) { - pool_src_md = pool_diff_src_md; - user_src_md = user_diff_src_md; - } - auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, pool_src_md, - pool_dst_md, pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto engine = streams[0].getEngine(); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); - - auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, - pool_strides, pool_kernel, pool_padding, pool_padding_r, padding_kind::zero); - - auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); - auto userB_src_memory = mkldnn::memory({user_diff_src_md, engine}, gradI.buffer()); - auto userB_dst_memory = mkldnn::memory({user_dst_md, engine}, const_cast(gradO).buffer()); - - auto poolB_src_memory = userB_src_memory; - streams[0].addMemory(userB_src_memory); - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_src_primitive_desc()) - != userB_src_memory.get_primitive_desc()) { - poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_primitive_desc()); - streams[0].addMemory(poolB_src_memory); - } - - auto poolB_dst_memory = userB_dst_memory; - streams[0].addMemory(userB_dst_memory); - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_dst_primitive_desc()) - != userB_dst_memory.get_primitive_desc()) { - poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_primitive_desc()); - streams[0].addMemory(poolB_dst_memory); - 
streams[0].addOperation(reorder(userB_dst_memory, poolB_dst_memory)); - } - - if (algorithm == mkldnn::pooling_max) { - auto user_src_memory = mkldnn::memory({user_src_md, engine}, const_cast(input).buffer()); - - auto pool_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(pool_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - pool_src_memory = mkldnn::memory(pool_prim_desc.src_primitive_desc()); - streams[0].addMemory(pool_src_memory); - streams[0].addOperation(reorder(user_src_memory, pool_src_memory)); - } - - auto pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_primitive_desc()); - streams[0].addMemory(pool_dst_memory); - - auto pool_workspace_memory = mkldnn::memory(pool_prim_desc.workspace_primitive_desc()); - streams[0].addMemory(pool_workspace_memory); - - streams[0].addOperation(pooling_forward(pool_prim_desc, pool_src_memory, pool_dst_memory, pool_workspace_memory)); - streams[0].addOperation(pooling_backward(poolB_prim_desc, poolB_dst_memory, pool_workspace_memory, poolB_src_memory)); - } else { - streams[0].addOperation(pooling_backward(poolB_prim_desc, poolB_dst_memory, poolB_src_memory)); - } - - if (mkldnn::memory::primitive_desc(poolB_prim_desc.diff_src_primitive_desc()) - != userB_src_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(poolB_src_memory, userB_src_memory)); - } - } - - streams[0].submitAndWait(); - return; - } -#endif nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0); const Nd4jLong iStride0 = input.stridesOf()[0]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index a02d5918c..0d0705104 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -27,107 +27,9 @@ namespace nd4j { namespace ops { namespace helpers { -#ifdef HAVE_MKLDNN -using namespace mkldnn; - -static void getMKLDNNMemoryDescLrn(const NDArray* src, const NDArray* diff_src, const NDArray* dst, - mkldnn::memory::desc* lrn_src_md, mkldnn::memory::desc* lrn_diff_src_md, mkldnn::memory::desc* lrn_dst_md, - mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis) { - const Nd4jLong* shape = src->getShapeInfo(); - long rank = shape[0]; - long dim1 = axis; // MKL-DNN supports only 1 axis, which has to be the "channel" one - long dim2 = axis >= 2 ? 1 : 2; - long dim3 = axis >= 3 ? 2 : 3; - mkldnn::memory::dims lrn_src_tz = { (int)shape[1], (int)shape[dim1 + 1], rank > 2 ? (int)shape[dim2 + 1] : 1, rank > 3 ? (int)shape[dim3 + 1] : 1}; - - auto type = mkldnn::memory::data_type::f32; - auto format = axis == 1 ? mkldnn::memory::format::nchw : mkldnn::memory::format::nhwc; - auto supposed_to_be_any_format = format; // doesn't work with "any" - - if (src != nullptr && src->getBuffer() != nullptr && lrn_src_md != nullptr) { - *lrn_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); - *user_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); - user_src_md->data.format = mkldnn_blocked; - user_src_md->data.layout_desc.blocking.strides[0][0] = src->stridesOf()[0]; - user_src_md->data.layout_desc.blocking.strides[0][1] = src->stridesOf()[dim1]; - user_src_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? src->stridesOf()[dim2] : 1; - user_src_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? 
src->stridesOf()[dim3] : 1; - } - - if (diff_src != nullptr && diff_src->getBuffer() != nullptr && lrn_diff_src_md != nullptr) { - *lrn_diff_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); - *user_diff_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); - user_diff_src_md->data.format = mkldnn_blocked; - user_diff_src_md->data.layout_desc.blocking.strides[0][0] = diff_src->stridesOf()[0]; - user_diff_src_md->data.layout_desc.blocking.strides[0][1] = diff_src->stridesOf()[dim1]; - user_diff_src_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? diff_src->stridesOf()[dim2] : 1; - user_diff_src_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? diff_src->stridesOf()[dim3] : 1; - } - - if (dst != nullptr && dst->getBuffer() != nullptr && lrn_dst_md != nullptr) { - *lrn_dst_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); - *user_dst_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); - user_dst_md->data.format = mkldnn_blocked; - user_dst_md->data.layout_desc.blocking.strides[0][0] = dst->stridesOf()[0]; - user_dst_md->data.layout_desc.blocking.strides[0][1] = dst->stridesOf()[dim1]; - user_dst_md->data.layout_desc.blocking.strides[0][2] = rank > 2 ? dst->stridesOf()[dim2] : 1; - user_dst_md->data.layout_desc.blocking.strides[0][3] = rank > 3 ? dst->stridesOf()[dim3] : 1; - } -} -#endif - template static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, float bias, float alpha, float beta) { -#ifdef HAVE_MKLDNN - if (block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output})) { - std::vector& streams = block.getMKLDNNStreams(); - if (streams.empty()) { - streams.push_back(MKLDNNStream("lrn")); - } - - if (streams[0].checkAndReset({input}, {output}, {(float)bias, (float)alpha, (float)beta}, {depth})) { - mkldnn_memory_desc_t empty; - mkldnn::memory::desc lrn_src_md(empty), lrn_dst_md(empty), user_src_md(empty), user_dst_md(empty); - - getMKLDNNMemoryDescLrn(input, nullptr, output, &lrn_src_md, nullptr, &lrn_dst_md, &user_src_md, nullptr, &user_dst_md, input->rankOf() - 1); - - auto lrn_desc = lrn_forward::desc(prop_kind::forward_inference, lrn_across_channels, lrn_src_md, (2 * depth + 1), alpha * (2 * depth + 1), beta, bias); - - auto engine = streams[0].getEngine(); - auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, engine); - auto user_src_memory = mkldnn::memory({user_src_md, engine}, input->buffer()); - auto user_dst_memory = mkldnn::memory({user_dst_md, engine}, output->buffer()); - - auto lrn_src_memory = user_src_memory; - streams[0].addMemory(user_src_memory); - if (mkldnn::memory::primitive_desc(lrn_prim_desc.src_primitive_desc()) - != user_src_memory.get_primitive_desc()) { - lrn_src_memory = mkldnn::memory(lrn_prim_desc.src_primitive_desc()); - streams[0].addMemory(lrn_src_memory); - streams[0].addOperation(reorder(user_src_memory, lrn_src_memory)); - } - - auto lrn_dst_memory = user_dst_memory; - streams[0].addMemory(user_dst_memory); - if (mkldnn::memory::primitive_desc(lrn_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - lrn_dst_memory = mkldnn::memory(lrn_prim_desc.dst_primitive_desc()); - streams[0].addMemory(lrn_dst_memory); - } - - streams[0].addOperation(lrn_forward(lrn_prim_desc, lrn_src_memory, lrn_dst_memory)); - - if (mkldnn::memory::primitive_desc(lrn_prim_desc.dst_primitive_desc()) - != user_dst_memory.get_primitive_desc()) { - streams[0].addOperation(reorder(lrn_dst_memory, user_dst_memory)); - 
} - } - - streams[0].submitAndWait(); - return ND4J_STATUS_OK; - } -#endif nd4j_debug("MKL-DNN is not used for lrn!\n", 0); const int rank = input->rankOf(); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index b313acd9c..3dea41a18 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -501,7 +502,22 @@ namespace nd4j { prepTime = std::chrono::duration_cast(timeStart - timeEnter).count(); } - Nd4jStatus status = this->validateAndExecute(*block); + + Nd4jStatus status; + bool hasHelper = false; + + // if we have platform-specific helper for this op - invoke it + if (OpRegistrator::getInstance()->hasHelper(this->getOpHash())) { + auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash()); + if (helper->isUsable(*block)) { + status = helper->invokeHelper(*block); + hasHelper = true; + } + } + + // if we don't have platform-specific helper - invoke generic implementation + if (!hasHelper) + status = this->validateAndExecute(*block); // optionally saving execution time if (Environment::getInstance()->isProfiling()) { diff --git a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp index 2105ac32b..a42203162 100644 --- a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp +++ b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp @@ -113,8 +113,13 @@ namespace nd4j { for (auto x : _uniqueD) delete x; + for (auto x: _uniqueH) + delete x; + _uniqueD.clear(); + _uniqueH.clear(); + _declarablesD.clear(); _declarablesLD.clear(); @@ -144,6 +149,8 @@ namespace nd4j { return _opsList.c_str(); } + + bool OpRegistrator::registerOperation(const char* name, nd4j::ops::DeclarableOp* op) { std::string str(name); std::pair pair(str, op); @@ -165,6 +172,19 @@ namespace nd4j { return registerOperation(op->getOpName()->c_str(), op); } + void OpRegistrator::registerHelper(nd4j::ops::platforms::PlatformHelper* op) { + if (_helpersLH.count(op->hash()) > 0) + throw std::runtime_error("Tried to double register PlatformHelper"); + + _uniqueH.emplace_back(op); + + std::pair pair(op->name(), op); + _helpersH.insert(pair); + + std::pair pair2(op->hash(), op); + _helpersLH.insert(pair2); + } + nd4j::ops::DeclarableOp* OpRegistrator::getOperation(const char *name) { std::string str(name); return getOperation(str); @@ -207,6 +227,16 @@ namespace nd4j { return _declarablesD.at(name); } + nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash) { + if (_helpersLH.count(hash) == 0) + throw std::runtime_error("Requested helper can't be found"); + + return _helpersLH[hash]; + } + + bool OpRegistrator::hasHelper(Nd4jLong hash) { + return _helpersLH.count(hash) > 0; + } int OpRegistrator::numberOfOperations() { return (int) _declarablesLD.size(); diff --git a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp new file mode 100644 index 000000000..75dc6e2c4 --- /dev/null +++ b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp @@ -0,0 +1,86 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. 
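// Illustrative sketch (assumption, not part of this patch): the OpRegistrator additions
// above (registerHelper / getPlatformHelper / hasHelper) imply that every PlatformHelper
// instance gets registered once, keyed by the op hash, before DeclarableOp::execute can
// dispatch to it. A minimal registration call might look like the line below; the concrete
// subclass name (PLATFORM_avgpool2d) and the place it runs (some static initializer) are
// assumptions for illustration only.
//
//     nd4j::ops::OpRegistrator::getInstance()->registerHelper(new nd4j::ops::platforms::PLATFORM_avgpool2d());
//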
+ * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "../PlatformHelper.h" +#include + +namespace nd4j { + namespace ops { + namespace platforms { + PlatformHelper::PlatformHelper(const char *name) { + // we just store name/hash of target operation + _name = std::string(name); + _hash = HashHelper::getInstance()->getLongHash(_name); + } + + nd4j::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) { + NDArray *z = nullptr; + + if (ctx.isFastPath()) { + if (ctx.fastpath_out().size() <= inputId) { + if (ctx.isInplace()) { + z = ctx.fastpath_in()[inputId]; + } else + throw std::runtime_error("fastpath_out: unresolved output array"); + } else { + z = ctx.fastpath_out()[inputId]; + } + } else { + std::pair pair(ctx.nodeId(), inputId); + + if (ctx.isInplace()) { + z = ctx.variable(inputId)->getNDArray(); + + // hypothetically it's possible to have no variable. chances are low, but who knows. let's just create it for now + if (!ctx.getVariableSpace()->hasVariable(pair)) { + auto var = new graph::Variable(); + ctx.getVariableSpace()->putVariable(pair, var); + } + + // now we're saving input array as output array + auto var = ctx.getVariableSpace()->getVariable(pair); + var->markRemovable(false); + var->setNDArray(z); + } else if (!ctx.isInplace()) { + auto var = ctx.variable(pair); + if (var->getNDArray() != nullptr && var->getNDArray()->nonNull()) { + z = var->getNDArray(); + } else { + nd4j_printf("Can't get Z variable for node_%i!\n", ctx.nodeId()); + } + } else { + nd4j_printf("BOOM!\n", ""); + throw std::runtime_error("Boom!"); + } + } + + return z; + } + + std::string PlatformHelper::name() { + return _name; + } + + Nd4jLong PlatformHelper::hash() { + return _hash; + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/README.md b/libnd4j/include/ops/declarable/platform/README.md new file mode 100644 index 000000000..65547feca --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/README.md @@ -0,0 +1 @@ +This folder contains platform-specific optimized implementations for custom operations \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp new file mode 100644 index 000000000..927c53e40 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp @@ -0,0 +1,143 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
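// Illustrative sketch (assumption, not literal macro output): the PLATFORM_IMPL / PLATFORM_CHECK
// macros used in the platform/mkldnn sources below are assumed to declare a PlatformHelper
// subclass roughly like the one sketched here, so that the isUsable()/invokeHelper() pair invoked
// from DeclarableOp::execute resolves to the bodies written inside the macros. The class name and
// exact signatures are illustrative only.
//
//     namespace nd4j { namespace ops { namespace platforms {
//         class PLATFORM_avgpool2d : public PlatformHelper {
//         public:
//             PLATFORM_avgpool2d() : PlatformHelper("avgpool2d") { }
//             bool isUsable(graph::Context &block) override;           // body supplied by PLATFORM_CHECK(avgpool2d)
//             Nd4jStatus invokeHelper(graph::Context &block) override; // body supplied by PLATFORM_IMPL(avgpool2d)
//         };
//     }}}
//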
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(avgpool2d) { + auto input = INPUT_VARIABLE(0); + + REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead", + input->rankOf()); + + // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; + auto argI = *(block.getIArguments()); + auto output = OUTPUT_VARIABLE(0); + + const auto kH = INT_ARG(0); + const auto kW = INT_ARG(1); + const auto sH = INT_ARG(2); + const auto sW = INT_ARG(3); + int pH = INT_ARG(4); + int pW = INT_ARG(5); + const auto dH = INT_ARG(6); + const auto dW = INT_ARG(7); + const auto isSameMode = static_cast(INT_ARG(8)); + const auto extraParam0 = INT_ARG(9); + + REQUIRE_TRUE(dH != 0 && dW != 0, 0, "AVGPOOL2D op: dilation must not be zero, but got instead {%i, %i}", + dH, dW); + + int oH = 0; + int oW = 0; + + int isNCHW = block.getIArguments()->size() > 10 ? !INT_ARG(10) : 1; // INT_ARG(10): 0-NCHW, 1-NHWC + + const int iH = static_cast(isNCHW ? input->sizeAt(2) : input->sizeAt(1)); + const int iW = static_cast(isNCHW ? input->sizeAt(3) : input->sizeAt(2)); + + if (!isNCHW) { + input = new NDArray( + input->permute({0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + output = new NDArray( + output->permute({0, 3, 1, 2})); // [bS, oH, oW, iC] -> [bS, iC, oH, oW] + } + + ConvolutionUtils::calcOutSizePool2D(oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode); + + if (isSameMode) + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + const int bS = input->sizeAt(0); + const int iC = input->sizeAt(1); + const int oC = output->sizeAt(1); + + auto poolingMode = PoolingType::AVG_POOL; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + mkldnnUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, + true, + bS, iC, iH, iW, oC, oH, oW, input, nullptr, output, + algorithm, + &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, + &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, + pool_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + auto pool_src_memory = user_src_memory; + mkldnn::stream stream(engine); + if (pool_prim_desc.src_desc() != 
user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + auto pool_dst_memory = user_dst_memory; + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_desc(), engine); + } + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}}); + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(pool_dst_memory, user_dst_memory).execute(stream, pool_dst_memory, user_dst_memory); + } + stream.wait(); + + //streams[0].submitAndWait(); + + if (!isNCHW) { + delete input; + delete output; + } + + return Status::OK(); + } + + PLATFORM_CHECK(avgpool2d) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp new file mode 100644 index 000000000..bf7b11b70 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d_bp.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(avgpool2d_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto gradO = INPUT_VARIABLE( + 1); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon + + int kH = INT_ARG(0); // filter(kernel) height + int kW = INT_ARG(1); // filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME + int extraParam0 = INT_ARG(9); + int isNCHW = + block.getIArguments()->size() > 10 ? 
!INT_ARG(10) : 1; // INT_ARG(10): 0-NCHW, 1-NHWC + + REQUIRE_TRUE(input->rankOf() == 4, 0, + "AVGPOOL2D_BP op: input should have rank of 4, but got %i instead", input->rankOf()); + REQUIRE_TRUE(dH != 0 && dW != 0, 0, + "AVGPOOL2D_BP op: dilation must not be zero, but got instead {%i, %i}", dH, dW); + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, + indIiH, indWiC, indWoC, indWkH, indOoH); + + std::string expectedGradOShape = ShapeUtils::shapeAsString( + ShapeUtils::composeShapeUsingDimsAndIdx({bS, iC, oH, oW, 0, indIOioC, indIiH, indIiH + 1})); + std::string expectedGradIShape = ShapeUtils::shapeAsString( + ShapeUtils::composeShapeUsingDimsAndIdx({bS, iC, iH, iW, 0, indIOioC, indIiH, indIiH + 1})); + REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, + "AVGPOOL2D_BP op: wrong shape of output's gradients array (next epsilon), expected is %s, but got %s instead !", + expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(expectedGradIShape == ShapeUtils::shapeAsString(gradI), 0, + "AVGPOOL2D_BP op: wrong shape of input's gradients array (epsilon), expected is %s, but got %s instead !", + expectedGradIShape.c_str(), ShapeUtils::shapeAsString(gradI).c_str()); + + + if (!isNCHW) { + input = new NDArray(input->permute( + {0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + gradI = new NDArray(gradI->permute( + {0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + gradO = new NDArray(gradO->permute( + {0, 3, 1, 2})); // [bS, oH, oW, iC] -> [bS, iC, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + auto poolingMode = PoolingType::AVG_POOL; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + mkldnnUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, + true, + bS, iC, iH, iW, oC, oH, oW, input, gradI, gradO, algorithm, + &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, + &user_diff_src_md, &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, + input->buffer() != nullptr ? 
pool_src_md : pool_diff_src_md, + pool_dst_md, pool_strides, pool_kernel, pool_padding, + pool_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, pool_strides, + pool_kernel, pool_padding, pool_padding_r); + auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); + auto userB_src_memory = mkldnn::memory(user_src_md, engine, gradI->buffer()); + auto userB_dst_memory = mkldnn::memory(user_dst_md, engine, gradO->buffer()); + auto poolB_src_memory = userB_src_memory; + mkldnn::stream stream(engine); + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_desc(), engine); + } + auto poolB_dst_memory = userB_dst_memory; + if (poolB_prim_desc.diff_dst_desc() != userB_dst_memory.get_desc()) { + poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_desc(), engine); + reorder(userB_dst_memory, poolB_dst_memory).execute(stream, userB_dst_memory, poolB_dst_memory); + } + pooling_backward(poolB_prim_desc).execute(stream, {{MKLDNN_ARG_DIFF_DST, poolB_dst_memory}, + {MKLDNN_ARG_DIFF_SRC, poolB_src_memory}}); + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + reorder(poolB_src_memory, userB_src_memory).execute(stream, poolB_src_memory, userB_src_memory); + } + stream.wait(); + + if (!isNCHW) { + delete input; + delete gradI; + delete gradO; + } + + + return Status::OK(); + } + + PLATFORM_CHECK(avgpool2d_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp new file mode 100644 index 000000000..74fe7c8de --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(avgpool3dnew) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto output = OUTPUT_VARIABLE( + 0); // [bS, oD, oH, oW, iC] (NDHWC) or [bS, iC, oD, oH, oW] (NCDHW) + + int kD = INT_ARG(0); // filter(kernel) depth + int kH = INT_ARG(1); // filter(kernel) height + int kW = INT_ARG(2); // filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + int extraParam0 = INT_ARG(13); // unnecessary for max case, required only for avg and pnorm cases + int isNCDHW = block.getIArguments()->size() > 14 ? !INT_ARG(14) : 1; // 1-NDHWC, 0-NCDHW + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "MAXPOOL3DNEW OP: rank of input array must be equal to 5, but got %i instead !", + input->rankOf()); + REQUIRE_TRUE(dD != 0 && dH != 0 && dW != 0, 0, + "MAXPOOL3DNEW op: dilation must not be zero, but got instead {%i, %i, %i}", dD, dH, dW); + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + std::string expectedOutputShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, oD, oH, oW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + REQUIRE_TRUE(expectedOutputShape == ShapeUtils::shapeAsString(output), 0, + "MAXPOOL3D op: wrong shape of output array, expected is %s, but got %s instead !", + expectedOutputShape.c_str(), ShapeUtils::shapeAsString(output).c_str()); + // REQUIRE_TRUE(iD >= kD && iH >= kH && iW >= kW, 0, "MAXPOOL3D OP: the input depth/height/width must be greater or equal to kernel(filter) depth/height/width, but got [%i, %i, %i] and [%i, %i, %i] correspondingly !", iD,iH,iW, kD,kH,kW); + // REQUIRE_TRUE(kD/2 >= pD && kH/2 >= pH && kW/2 >= pW, 0, "MAXPOOL3D OP: pad depth/height/width must not be greater than half of kernel depth/height/width, but got [%i, %i, %i] and [%i, %i, %i] correspondingly !", pD,pH,pW, kD,kH,kW); + + if (!isNCDHW) { + input = new NDArray( + input->permute({0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + output = new NDArray( + output->permute({0, 4, 1, 2, 3})); // [bS, oD, oH, oW, iC] -> [bS, iC, oD, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + + + auto poolingMode = PoolingType::AVG_POOL; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, 
pool_padding_r; + mkldnn::algorithm algorithm; + mkldnnUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, + extraParam0, true, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, nullptr, output, + algorithm, + &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, + &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, + pool_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + auto pool_src_memory = user_src_memory; + if (pool_prim_desc.src_desc() != user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + auto pool_dst_memory = user_dst_memory; + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_desc(), engine); + } + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}}); + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(pool_dst_memory, user_dst_memory).execute(stream, pool_dst_memory, user_dst_memory); + } + stream.wait(); + + if (!isNCDHW) { + delete input; + delete output; + } + + return Status::OK(); + } + + PLATFORM_CHECK(avgpool3dnew) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp new file mode 100644 index 000000000..cc57e671d --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d_bp.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(avgpool3dnew_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto gradO = INPUT_VARIABLE( + 1); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon + + const int kD = INT_ARG(0); // filter(kernel) depth + const int kH = INT_ARG(1); // filter(kernel) height + const int kW = INT_ARG(2); // filter(kernel) width + const int sD = INT_ARG(3); // strides depth + const int sH = INT_ARG(4); // strides height + const int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + const int dD = INT_ARG(9); // dilations depth + const int dH = INT_ARG(10); // dilations height + const int dW = INT_ARG(11); // dilations width + const int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + int extraParam0 = INT_ARG(13); // unnecessary for max case, required only for avg and pnorm cases + int isNCDHW = block.getIArguments()->size() > 14 ? !INT_ARG(14) : 1; // 1-NDHWC, 0-NCDHW + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "MAXPOOL3D_BP op: input should have rank of 5, but got %i instead", input->rankOf()); + REQUIRE_TRUE(dD != 0 && dH != 0 && dW != 0, 0, + "MAXPOOL3DNEW op: dilation must not be zero, but got instead {%i, %i, %i}", dD, dH, dW); + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + std::string expectedGradOShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, oD, oH, oW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + std::string expectedGradIShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, iD, iH, iW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, + "MAXPOOL3D_BP op: wrong shape of output's gradients array (next epsilon), expected is %s, but got %s instead !", + expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(expectedGradIShape == ShapeUtils::shapeAsString(gradI), 0, + "MAXPOOL3D_BP op: wrong shape of input's gradients array (epsilon), expected is %s, but got %s instead !", + expectedGradIShape.c_str(), ShapeUtils::shapeAsString(gradI).c_str()); + + if (!isNCDHW) { + input = new NDArray(input->permute( + {0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + gradI = new NDArray(gradI->permute( + {0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + gradO = new NDArray(gradO->permute( + {0, 4, 1, 2, 3})); // [bS, oD, oH, oW, iC] -> [bS, iC, oD, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + + + + auto poolingMode = 
PoolingType::AVG_POOL; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + mkldnnUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, + extraParam0, true, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, gradI, gradO, + algorithm, + &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, + &user_diff_src_md, &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + if (input->buffer() == nullptr) { + pool_src_md = pool_diff_src_md; + user_src_md = user_diff_src_md; + } + auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, pool_src_md, pool_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, pool_strides, + pool_kernel, pool_padding, pool_padding_r); + auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); + auto userB_src_memory = mkldnn::memory(user_diff_src_md, engine, gradI->buffer()); + auto userB_dst_memory = mkldnn::memory(user_dst_md, engine, gradO->buffer()); + auto poolB_src_memory = userB_src_memory; + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_desc(), engine); + } + auto poolB_dst_memory = userB_dst_memory; + if (poolB_prim_desc.diff_dst_desc() != userB_dst_memory.get_desc()) { + poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_desc(), engine); + reorder(userB_dst_memory, poolB_dst_memory).execute(stream, userB_dst_memory, poolB_dst_memory); + } + pooling_backward(poolB_prim_desc).execute(stream, {{MKLDNN_ARG_DIFF_DST, poolB_dst_memory}, + {MKLDNN_ARG_DIFF_SRC, poolB_src_memory}}); + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + reorder(poolB_src_memory, userB_src_memory).execute(stream, poolB_src_memory, userB_src_memory); + } + stream.wait(); + + if (!isNCDHW) { + delete input; + delete gradI; + delete gradO; + } + + return Status::OK(); + } + + PLATFORM_CHECK(avgpool3dnew_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp new file mode 100644 index 000000000..4947a39c0 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -0,0 +1,166 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(batchnorm_new) { + auto input = INPUT_VARIABLE(0); + auto mean = INPUT_VARIABLE(1); + auto variance = INPUT_VARIABLE(2); + NDArray *gamma = nullptr; + NDArray *beta = nullptr; + + auto output = OUTPUT_VARIABLE(0); + + const bool applyScale = (bool) INT_ARG(0); + const bool applyOffset = (bool) INT_ARG(1); + const double epsilon = T_ARG(0); + + if (applyScale) + gamma = INPUT_VARIABLE(3); + if (applyOffset) + beta = INPUT_VARIABLE(3 + static_cast(applyScale)); + + std::vector axes; + if (block.numI() > 2) + for (int i = 2; i < block.numI(); ++i) + axes.push_back(INT_ARG(i)); + else + axes.push_back(input->rankOf() - 1); + + std::vector shape({2, mean->lengthOf()}); + NDArray weights = NDArrayFactory::create('c', shape, block.launchContext()); + weights({0, 1, 0, 0}).assign(1.0f); + weights({1, 2, 0, 0}).assign(0.0f); + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc batchnorm_src_md(empty), batchnorm_dst_md(empty), user_src_md( + empty), user_dst_md(empty); + + auto norm_flag = normalization_flags::use_global_stats; + if (applyScale || applyOffset) + norm_flag |= normalization_flags::use_scale_shift; + + mkldnnUtils::getMKLDNNMemoryDescBatchNorm(input, nullptr, output, + &batchnorm_src_md, nullptr, &batchnorm_dst_md, + &user_src_md, nullptr, &user_dst_md, axes[0]); + + auto batchnorm_desc = batch_normalization_forward::desc(prop_kind::forward_inference, batchnorm_src_md, epsilon, norm_flag); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto batchnorm_prim_desc = batch_normalization_forward::primitive_desc(batchnorm_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + auto batchnorm_mean_memory = mkldnn::memory(batchnorm_prim_desc.mean_desc(), engine, + mean->buffer()); + auto batchnorm_variance_memory = mkldnn::memory(batchnorm_prim_desc.variance_desc(), engine, + variance->buffer()); + auto batchnorm_src_memory = user_src_memory; + mkldnn::memory m(batchnorm_src_md, engine); + if (m.get_desc() != user_src_memory.get_desc()) { + batchnorm_src_memory = mkldnn::memory(batchnorm_src_md, engine); + reorder(user_src_memory, batchnorm_src_memory).execute(stream, user_src_memory, + batchnorm_src_memory); + } + auto batchnorm_dst_memory = user_dst_memory; + if (batchnorm_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + batchnorm_dst_memory = mkldnn::memory(batchnorm_prim_desc.dst_desc(), engine); + } + if (applyScale || applyOffset) { + if (gamma != nullptr) { + weights({0, 1, 0, 0}).assign(gamma); + } + if (beta != nullptr) { + weights({1, 2, 0, 0}).assign(beta); + } + + auto batchnorm_weights_memory = mkldnn::memory(batchnorm_prim_desc.weights_desc(), engine, weights.buffer()); + 
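// gamma (scale) and beta (shift) were packed above into rows 0 and 1 of the 2 x C weights buffer, since use_scale_shift expects a single combined scale/shift tensor + 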
batch_normalization_forward(batchnorm_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, batchnorm_src_memory}, + {MKLDNN_ARG_MEAN, batchnorm_mean_memory}, + {MKLDNN_ARG_VARIANCE, batchnorm_variance_memory}, + {MKLDNN_ARG_WEIGHTS, batchnorm_weights_memory}, + {MKLDNN_ARG_DST, batchnorm_dst_memory}}); + } else { + batch_normalization_forward(batchnorm_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, batchnorm_src_memory}, + {MKLDNN_ARG_MEAN, batchnorm_mean_memory}, + {MKLDNN_ARG_VARIANCE, batchnorm_variance_memory}, + {MKLDNN_ARG_DST, batchnorm_dst_memory}}); + } + if (batchnorm_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(batchnorm_dst_memory, user_dst_memory).execute(stream, batchnorm_dst_memory, + user_dst_memory); + } + stream.wait(); + + return Status::OK(); + } + + PLATFORM_CHECK(batchnorm_new) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto mean = INPUT_VARIABLE(1); + auto variance = INPUT_VARIABLE(2); + NDArray *gamma = nullptr; + NDArray *beta = nullptr; + + auto output = OUTPUT_VARIABLE(0); + + const bool applyScale = (bool) INT_ARG(0); + const bool applyOffset = (bool) INT_ARG(1); + const double epsilon = T_ARG(0); + + if (applyScale) + gamma = INPUT_VARIABLE(3); + if (applyOffset) + beta = INPUT_VARIABLE(3 + static_cast(applyScale)); + + std::vector axes; + if (block.numI() > 2) + for (int i = 2; i < block.numI(); ++i) + axes.push_back(INT_ARG(i)); + else + axes.push_back(input->rankOf() - 1); + + return block.isUseMKLDNN() && + nd4j::MKLDNNStream::isSupported({input, mean, variance, gamma, beta, output}) && + axes.size() == 1; + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp new file mode 100644 index 000000000..187668899 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + static void conv2d_mkldnn(nd4j::graph::Context &block, const NDArray *input, const NDArray *weights, + const NDArray *bias, NDArray *output, const int kH, const int kW, const int sH, + const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, + const int isNCHW) { + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, + indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH); + + if(isSameMode) // SAME + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc conv_src_md(empty), conv_weights_md(empty), conv_bias_md(empty), conv_dst_md( + empty); + mkldnn::memory::desc user_src_md(empty), user_weights_md(empty), user_bias_md(empty), user_dst_md( + empty); + mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; + mkldnnUtils::getMKLDNNMemoryDescConv2d(kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW, + bS, iC, iH, iW, oC, oH, oW, input, nullptr, weights, nullptr, + bias, output, + &conv_src_md, nullptr, &conv_weights_md, nullptr, + &conv_bias_md, &conv_dst_md, + &user_src_md, nullptr, &user_weights_md, nullptr, + &user_bias_md, &user_dst_md, + conv_strides, conv_padding, conv_padding_r); + auto conv_desc = bias != nullptr + ? 
convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r) + : convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, const_cast(input)->buffer()); + auto user_weights_memory = mkldnn::memory(user_weights_md, engine, + const_cast(weights)->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + auto conv_src_memory = user_src_memory; + if (conv_prim_desc.src_desc() != user_src_memory.get_desc()) { + conv_src_memory = mkldnn::memory(conv_prim_desc.src_desc(), engine); + reorder(user_src_memory, conv_src_memory).execute(stream, user_src_memory, conv_src_memory); + } + auto conv_weights_memory = user_weights_memory; + if (conv_prim_desc.weights_desc() != user_weights_memory.get_desc()) { + conv_weights_memory = mkldnn::memory(conv_prim_desc.weights_desc(), engine); + reorder(user_weights_memory, conv_weights_memory).execute(stream, user_weights_memory, + conv_weights_memory); + } + auto conv_dst_memory = user_dst_memory; + if (conv_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + conv_dst_memory = mkldnn::memory(conv_prim_desc.dst_desc(), engine); + } + if (bias != nullptr) { + auto conv_bias_memory = mkldnn::memory(conv_prim_desc.bias_desc(), engine, + const_cast(bias)->buffer()); + convolution_forward(conv_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, conv_src_memory}, + {MKLDNN_ARG_WEIGHTS, conv_weights_memory}, + {MKLDNN_ARG_BIAS, conv_bias_memory}, + {MKLDNN_ARG_DST, conv_dst_memory}}); + } else { + convolution_forward(conv_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, conv_src_memory}, + {MKLDNN_ARG_WEIGHTS, conv_weights_memory}, + {MKLDNN_ARG_DST, conv_dst_memory}}); + } + if (conv_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(conv_dst_memory, user_dst_memory).execute(stream, conv_dst_memory, user_dst_memory); + } + stream.wait(); + } + + PLATFORM_IMPL(conv2d) { + auto input = INPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always + auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] + + auto output = OUTPUT_VARIABLE( + 0); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) + + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME + bool isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0)); // filter(kernel) height + int kW = INT_ARG(1) > 0 ? 
INT_ARG(1) : static_cast(weights->sizeAt(1)); // filter(kernel) width + + conv2d_mkldnn(block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW); + + return Status::OK(); + } + + PLATFORM_CHECK(conv2d) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto weights = INPUT_VARIABLE(1); + + // conv2d is only available for float32 dtype + return block.isUseMKLDNN() && input->dataType() == nd4j::DataType::FLOAT32 && + weights->dataType() == nd4j::DataType::FLOAT32; + } + } + } +} diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d_bp.cpp new file mode 100644 index 000000000..104684e97 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d_bp.cpp @@ -0,0 +1,243 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(conv2d_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE( + 1); // [kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE( + 2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon + auto gradW = OUTPUT_VARIABLE( + 1); // [kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + int kH = INT_ARG(0); // filter(kernel) height + int kW = INT_ARG(1); // filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME + int isNCHW = block.getIArguments()->size() > 9 ? 
!INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC + + REQUIRE_TRUE(input->rankOf() == 4, 0, + "CUSTOM CONV2D_BP OP: rank of input array must be equal to 4, but got %i instead !", + input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 4, 0, + "CUSTOM CONV2D_BP OP: rank of weights array must be equal to 4, but got %i instead !", + weights->rankOf()); + REQUIRE_TRUE(gradO->rankOf() == 4, 0, + "CUSTOM CONV2D_BP OP: rank of output's gradients (next epsilon) array must be equal to 4, but got %i instead !", + gradO->rankOf()); + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, + indIiH, indWiC, indWoC, indWkH, indOoH); + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc conv_src_md(empty), conv_diff_src_md(empty), conv_weights_md(empty), + conv_diff_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_weights_md(empty), + user_diff_weights_md(empty), user_bias_md(empty), user_dst_md(empty); + mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; + mkldnnUtils::getMKLDNNMemoryDescConv2d(kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW, + bS, iC, iH, iW, oC, oH, oW, input, gradI, weights, gradW, + gradB, gradO, + &conv_src_md, &conv_diff_src_md, &conv_weights_md, + &conv_diff_weights_md, &conv_bias_md, &conv_dst_md, + &user_src_md, &user_diff_src_md, &user_weights_md, + &user_diff_weights_md, &user_bias_md, &user_dst_md, + conv_strides, conv_padding, conv_padding_r); + auto conv_desc = gradB != nullptr + ? convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r) + : convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r); + auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, mkldnnUtils::getEngine( + LaunchContext::defaultContext()->engine())); + if (gradW != nullptr) { + auto convW_desc = gradB != nullptr + ? 
convolution_backward_weights::desc( + algorithm::convolution_auto, conv_src_md, conv_diff_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, conv_padding_r) + : convolution_backward_weights::desc( + algorithm::convolution_auto, conv_src_md, conv_diff_weights_md, + conv_dst_md, conv_strides, conv_padding, conv_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto convW_prim_desc = convolution_backward_weights::primitive_desc(convW_desc, engine, + conv_prim_desc); + auto userW_src_memory = mkldnn::memory(user_src_md, engine, + const_cast(input)->buffer()); + auto userW_weights_memory = mkldnn::memory(user_diff_weights_md, engine, gradW->buffer()); + auto userW_dst_memory = mkldnn::memory(user_dst_md, engine, + const_cast(gradO)->buffer()); + + auto convW_src_memory = userW_src_memory; + if (convW_prim_desc.src_desc() != userW_src_memory.get_desc()) { + convW_src_memory = mkldnn::memory(convW_prim_desc.src_desc(), engine); + reorder(userW_src_memory, convW_src_memory).execute(stream, userW_src_memory, + convW_src_memory); + } + + auto convW_weights_memory = userW_weights_memory; + if (convW_prim_desc.diff_weights_desc() != userW_weights_memory.get_desc()) { + convW_weights_memory = mkldnn::memory(convW_prim_desc.diff_weights_desc(), engine); + } + + auto convW_dst_memory = userW_dst_memory; + if (convW_prim_desc.diff_dst_desc() != userW_dst_memory.get_desc()) { + convW_dst_memory = mkldnn::memory(convW_prim_desc.diff_dst_desc(), engine); + reorder(userW_dst_memory, convW_dst_memory).execute(stream, userW_dst_memory, + convW_dst_memory); + } + + if (gradB != nullptr) { + auto convW_bias_memory = mkldnn::memory(convW_prim_desc.diff_bias_desc(), engine, + gradB->buffer()); + convolution_backward_weights(convW_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, convW_src_memory}, + {MKLDNN_ARG_DIFF_DST, convW_dst_memory}, + {MKLDNN_ARG_DIFF_WEIGHTS, convW_weights_memory}, + {MKLDNN_ARG_DIFF_BIAS, convW_bias_memory}}); + } else { + convolution_backward_weights(convW_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, convW_src_memory}, + {MKLDNN_ARG_DIFF_DST, convW_dst_memory}, + {MKLDNN_ARG_DIFF_WEIGHTS, convW_weights_memory}}); + } + + if (convW_prim_desc.diff_weights_desc() != userW_weights_memory.get_desc()) { + reorder(convW_weights_memory, userW_weights_memory).execute(stream, convW_weights_memory, + userW_weights_memory); + } + + stream.wait(); + } + + if (gradI != nullptr) { + auto convI_desc = + convolution_backward_data::desc(algorithm::convolution_auto, conv_diff_src_md, + conv_weights_md, conv_dst_md, conv_strides, + conv_padding, conv_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto convI_prim_desc = convolution_backward_data::primitive_desc(convI_desc, engine, + conv_prim_desc); + auto userI_src_memory = mkldnn::memory(user_diff_src_md, engine, gradI->buffer()); + auto userI_weights_memory = mkldnn::memory(user_weights_md, engine, + const_cast(weights)->buffer()); + auto userI_dst_memory = mkldnn::memory(user_dst_md, engine, + const_cast(gradO)->buffer()); + + auto convI_src_memory = userI_src_memory; + if (convI_prim_desc.diff_src_desc() != userI_src_memory.get_desc()) { + convI_src_memory = mkldnn::memory(convI_prim_desc.diff_src_desc(), engine); + } + + auto convI_weights_memory = userI_weights_memory; + if (convI_prim_desc.weights_desc() != userI_weights_memory.get_desc()) { + convI_weights_memory = 
mkldnn::memory(convI_prim_desc.weights_desc(), engine); + reorder(userI_weights_memory, convI_weights_memory).execute(stream, userI_weights_memory, + convI_weights_memory); + } + + auto convI_dst_memory = userI_dst_memory; + if (convI_prim_desc.diff_dst_desc() != userI_dst_memory.get_desc()) { + convI_dst_memory = mkldnn::memory(convI_prim_desc.diff_dst_desc(), engine); + reorder(userI_dst_memory, convI_dst_memory).execute(stream, userI_dst_memory, + convI_dst_memory); + } + + convolution_backward_data(convI_prim_desc).execute(stream, + {{MKLDNN_ARG_DIFF_DST, convI_dst_memory}, + {MKLDNN_ARG_WEIGHTS, convI_weights_memory}, + {MKLDNN_ARG_DIFF_SRC, convI_src_memory}}); + + if (convI_prim_desc.diff_src_desc() != userI_src_memory.get_desc()) { + reorder(convI_src_memory, userI_src_memory).execute(stream, convI_src_memory, + userI_src_memory); + } + + stream.wait(); + }; + + return Status::OK(); + } + + PLATFORM_CHECK(conv2d_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto weights = INPUT_VARIABLE( + 1); // [kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE( + 2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon + auto gradW = OUTPUT_VARIABLE( + 1); // [kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + + return block.isUseMKLDNN() && + nd4j::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); + } + } + } +} diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp new file mode 100644 index 000000000..6e7716320 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -0,0 +1,167 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(conv3dnew) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 2 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] + auto output = OUTPUT_VARIABLE( + 0); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "CUSTOM CONV3D OP: rank of input array must be equal to 5, but got %i instead !", + input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 5, 0, + "CUSTOM CONV3D OP: rank of weights array must be equal to 5, but got %i instead !", + weights->rankOf()); + + int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) depth + int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) height + int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast(weights->sizeAt(2));// filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + int isNCDHW = + block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + std::string expectedWeightsShape = ShapeUtils::shapeAsString({kD, kH, kW, iC, oC}); + REQUIRE_TRUE(expectedWeightsShape == ShapeUtils::shapeAsString(weights), 0, + "CUSTOM CONV3D OP: wrong shape of weights array, expected is %s, but got %s instead !", + expectedWeightsShape.c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if (bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, + "CUSTOM CONV3D OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", + oC, bias->rankOf(), bias->lengthOf()); + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc conv_src_md(empty), conv_weights_md(empty), conv_bias_md(empty), conv_dst_md( + empty); + mkldnn::memory::desc user_src_md(empty), user_weights_md(empty), user_bias_md(empty), user_dst_md( + empty); + mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; + mkldnnUtils::getMKLDNNMemoryDescConv3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, + isNCDHW, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, nullptr, weights, + nullptr, bias, output, + &conv_src_md, nullptr, &conv_weights_md, nullptr, + &conv_bias_md, &conv_dst_md, + &user_src_md, nullptr, &user_weights_md, nullptr, + &user_bias_md, &user_dst_md, + conv_strides, conv_padding, conv_padding_r); + auto conv_desc = bias != nullptr + ? 
convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r) + : convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r); + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, const_cast(input)->buffer()); + auto user_weights_memory = mkldnn::memory(user_weights_md, engine, + const_cast(weights)->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + auto conv_src_memory = user_src_memory; + if (conv_prim_desc.src_desc() != user_src_memory.get_desc()) { + conv_src_memory = mkldnn::memory(conv_prim_desc.src_desc(), engine); + reorder(user_src_memory, conv_src_memory).execute(stream, user_src_memory, conv_src_memory); + } + auto conv_weights_memory = user_weights_memory; + if (conv_prim_desc.weights_desc() != user_weights_memory.get_desc()) { + conv_weights_memory = mkldnn::memory(conv_prim_desc.weights_desc(), engine); + reorder(user_weights_memory, conv_weights_memory).execute(stream, user_weights_memory, + conv_weights_memory); + } + auto conv_dst_memory = user_dst_memory; + if (conv_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + conv_dst_memory = mkldnn::memory(conv_prim_desc.dst_desc(), engine); + } + if (bias != nullptr) { + auto conv_bias_memory = mkldnn::memory(conv_prim_desc.bias_desc(), engine, bias->buffer()); + convolution_forward(conv_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, conv_src_memory}, + {MKLDNN_ARG_WEIGHTS, conv_weights_memory}, + {MKLDNN_ARG_BIAS, conv_bias_memory}, + {MKLDNN_ARG_DST, conv_dst_memory}}); + } else { + convolution_forward(conv_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, conv_src_memory}, + {MKLDNN_ARG_WEIGHTS, conv_weights_memory}, + {MKLDNN_ARG_DST, conv_dst_memory}}); + } + if (conv_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(conv_dst_memory, user_dst_memory).execute(stream, conv_dst_memory, user_dst_memory); + } + stream.wait(); + + return Status::OK(); + } + + PLATFORM_CHECK(conv3dnew) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto output = OUTPUT_VARIABLE( + 0); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, weights, bias, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3dnew_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3dnew_bp.cpp new file mode 100644 index 000000000..c8af275f9 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3dnew_bp.cpp @@ -0,0 +1,263 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(conv3dnew_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE( + 1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE( + 2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon + auto gradW = OUTPUT_VARIABLE( + 1); // [kD, kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "CUSTOM CONV3D_BP OP: rank of input array must be equal to 5, but got %i instead !", + input->rankOf()); + REQUIRE_TRUE(weights->rankOf() == 5, 0, + "CUSTOM CONV3D_BP OP: rank of weights array must be equal to 5, but got %i instead !", + weights->rankOf()); + REQUIRE_TRUE(gradO->rankOf() == 5, 0, + "CUSTOM CONV3D_BP OP: rank of output gradients (next epsilon) array must be equal to 5, but got %i instead !", + gradO->rankOf()); + + int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast(weights->sizeAt(0));// filter(kernel) depth + int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast(weights->sizeAt(1));// filter(kernel) height + int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast(weights->sizeAt(2));// filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + int isNDHWC = + block.getIArguments()->size() > 13 ? 
!INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + int trueoD, trueoH, trueoW; // true output depth/height/width + ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, + dW, iD, iH, iW, isSameMode); + + std::string expectedGradOShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, oC, trueoD, trueoH, trueoW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + std::string expectedWeightsShape = ShapeUtils::shapeAsString({kD, kH, kW, iC, oC}); + REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, + "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", + expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(expectedWeightsShape == ShapeUtils::shapeAsString(weights), 0, + "CUSTOM CONV3D_BP OP: wrong shape of weights array, expected is %s, but got %s instead !", + expectedWeightsShape.c_str(), ShapeUtils::shapeAsString(weights).c_str()); + if (bias) + REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, + "CUSTOM CONV3D_BP OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", + oC, bias->rankOf(), bias->lengthOf()); + + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc conv_src_md(empty), conv_diff_src_md(empty), conv_weights_md(empty), + conv_diff_weights_md(empty), conv_bias_md(empty), conv_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_weights_md(empty), + user_diff_weights_md(empty), user_bias_md(empty), user_dst_md(empty); + mkldnn::memory::dims conv_strides, conv_padding, conv_padding_r; + mkldnnUtils::getMKLDNNMemoryDescConv3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isSameMode, + isNDHWC, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, gradI, weights, + gradW, gradB, gradO, + &conv_src_md, &conv_diff_src_md, &conv_weights_md, + &conv_diff_weights_md, &conv_bias_md, &conv_dst_md, + &user_src_md, &user_diff_src_md, &user_weights_md, + &user_diff_weights_md, &user_bias_md, &user_dst_md, + conv_strides, conv_padding, conv_padding_r); + auto conv_desc = gradB != nullptr + ? convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r) + : convolution_forward::desc(prop_kind::forward, + algorithm::convolution_auto, conv_src_md, + conv_weights_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r); + auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, mkldnnUtils::getEngine( + LaunchContext::defaultContext()->engine())); + if (gradW != nullptr) { + auto convW_desc = gradB != nullptr + ? 
convolution_backward_weights::desc( + algorithm::convolution_auto, conv_src_md, conv_diff_weights_md, conv_bias_md, + conv_dst_md, conv_strides, conv_padding, conv_padding_r) + : convolution_backward_weights::desc( + algorithm::convolution_auto, conv_src_md, conv_diff_weights_md, + conv_dst_md, conv_strides, conv_padding, conv_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto convW_prim_desc = convolution_backward_weights::primitive_desc(convW_desc, engine, + conv_prim_desc); + auto userW_src_memory = mkldnn::memory(user_src_md, engine, + const_cast(input)->buffer()); + auto userW_weights_memory = mkldnn::memory(user_diff_weights_md, engine, gradW->buffer()); + auto userW_dst_memory = mkldnn::memory(user_dst_md, engine, + const_cast(gradO)->buffer()); + + auto convW_src_memory = userW_src_memory; + if (convW_prim_desc.src_desc() != userW_src_memory.get_desc()) { + convW_src_memory = mkldnn::memory(convW_prim_desc.src_desc(), engine); + reorder(userW_src_memory, convW_src_memory).execute(stream, userW_src_memory, + convW_src_memory); + } + + auto convW_weights_memory = userW_weights_memory; + if (convW_prim_desc.diff_weights_desc() != userW_weights_memory.get_desc()) { + convW_weights_memory = mkldnn::memory(convW_prim_desc.diff_weights_desc(), engine); + } + + auto convW_dst_memory = userW_dst_memory; + if (convW_prim_desc.diff_dst_desc() != userW_dst_memory.get_desc()) { + convW_dst_memory = mkldnn::memory(convW_prim_desc.diff_dst_desc(), engine); + reorder(userW_dst_memory, convW_dst_memory).execute(stream, userW_dst_memory, + convW_dst_memory); + } + + if (gradB != nullptr) { + auto convW_bias_memory = mkldnn::memory(convW_prim_desc.diff_bias_desc(), engine, + gradB->buffer()); + convolution_backward_weights(convW_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, convW_src_memory}, + {MKLDNN_ARG_DIFF_DST, convW_dst_memory}, + {MKLDNN_ARG_DIFF_WEIGHTS, convW_weights_memory}, + {MKLDNN_ARG_DIFF_BIAS, convW_bias_memory}}); + } else { + convolution_backward_weights(convW_prim_desc).execute(stream, + {{MKLDNN_ARG_SRC, convW_src_memory}, + {MKLDNN_ARG_DIFF_DST, convW_dst_memory}, + {MKLDNN_ARG_DIFF_WEIGHTS, convW_weights_memory}}); + } + + if (convW_prim_desc.diff_weights_desc() != userW_weights_memory.get_desc()) { + reorder(convW_weights_memory, userW_weights_memory).execute(stream, convW_weights_memory, + userW_weights_memory); + } + + stream.wait(); + } + if (gradI != nullptr) { + auto convI_desc = convolution_backward_data::desc(algorithm::convolution_auto, + conv_diff_src_md, conv_weights_md, + conv_dst_md, conv_strides, conv_padding, + conv_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto convI_prim_desc = convolution_backward_data::primitive_desc(convI_desc, engine, + conv_prim_desc); + auto userI_src_memory = mkldnn::memory(user_diff_src_md, engine, gradI->buffer()); + auto userI_weights_memory = mkldnn::memory(user_weights_md, engine, + const_cast(weights)->buffer()); + auto userI_dst_memory = mkldnn::memory(user_dst_md, engine, + const_cast(gradO)->buffer()); + + auto convI_src_memory = userI_src_memory; + if (convI_prim_desc.diff_src_desc() != userI_src_memory.get_desc()) { + convI_src_memory = mkldnn::memory(convI_prim_desc.diff_src_desc(), engine); + } + + auto convI_weights_memory = userI_weights_memory; + if (convI_prim_desc.weights_desc() != userI_weights_memory.get_desc()) { + convI_weights_memory = 
mkldnn::memory(convI_prim_desc.weights_desc(), engine); + reorder(userI_weights_memory, convI_weights_memory).execute(stream, userI_weights_memory, + convI_weights_memory); + } + + auto convI_dst_memory = userI_dst_memory; + if (convI_prim_desc.diff_dst_desc() != userI_dst_memory.get_desc()) { + convI_dst_memory = mkldnn::memory(convI_prim_desc.diff_dst_desc(), engine); + reorder(userI_dst_memory, convI_dst_memory).execute(stream, userI_dst_memory, + convI_dst_memory); + } + + convolution_backward_data(convI_prim_desc).execute(stream, + {{MKLDNN_ARG_DIFF_DST, convI_dst_memory}, + {MKLDNN_ARG_WEIGHTS, convI_weights_memory}, + {MKLDNN_ARG_DIFF_SRC, convI_src_memory}}); + + if (convI_prim_desc.diff_src_desc() != userI_src_memory.get_desc()) { + reorder(convI_src_memory, userI_src_memory).execute(stream, convI_src_memory, + userI_src_memory); + } + + stream.wait(); + } + + return Status::OK(); + } + + PLATFORM_CHECK(conv3dnew_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto weights = INPUT_VARIABLE( + 1); // [kD, kH, kW, iC, oC] always + auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] + auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE( + 2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon + auto gradW = OUTPUT_VARIABLE( + 1); // [kD, kH, kW, iC, oC] always + auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] + + return block.isUseMKLDNN() && + nd4j::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp new file mode 100644 index 000000000..aa4f9272a --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(lrn) { + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + REQUIRE_TRUE(input->rankOf() == 4, 0, "lrn: Input rank of 4 expected, but got %i instead", + input->rankOf()); + + double alpha = T_ARG(1); + double beta = T_ARG(2); + double bias = T_ARG(0); + int depth = INT_ARG(0); + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc lrn_src_md(empty), lrn_dst_md(empty), user_src_md(empty), user_dst_md(empty); + + mkldnnUtils::getMKLDNNMemoryDescLrn(input, nullptr, output, &lrn_src_md, nullptr, &lrn_dst_md, + &user_src_md, nullptr, &user_dst_md, input->rankOf() - 1); + + auto lrn_desc = lrn_forward::desc(prop_kind::forward_inference, algorithm::lrn_across_channels, + lrn_src_md, (2 * depth + 1), alpha * (2 * depth + 1), beta, bias); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + + auto lrn_src_memory = user_src_memory; + if (lrn_prim_desc.src_desc() != user_src_memory.get_desc()) { + lrn_src_memory = mkldnn::memory(lrn_prim_desc.src_desc(), engine); + reorder(user_src_memory, lrn_src_memory).execute(stream, user_src_memory, lrn_src_memory); + } + + auto lrn_dst_memory = user_dst_memory; + if (lrn_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + lrn_dst_memory = mkldnn::memory(lrn_prim_desc.dst_desc(), engine); + } + + lrn_forward(lrn_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, lrn_src_memory}, + {MKLDNN_ARG_DST, lrn_dst_memory}}); + + if (lrn_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(lrn_dst_memory, user_dst_memory).execute(stream, lrn_dst_memory, user_dst_memory); + } + + stream.wait(); + + return Status::OK(); + }; + + PLATFORM_CHECK(lrn) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp new file mode 100644 index 000000000..86115d723 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp @@ -0,0 +1,149 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(maxpool2d) { + auto input = INPUT_VARIABLE(0); + + REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead", + input->rankOf()); + + // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; + auto argI = *(block.getIArguments()); + auto output = OUTPUT_VARIABLE(0); + + const auto kH = INT_ARG(0); + const auto kW = INT_ARG(1); + const auto sH = INT_ARG(2); + const auto sW = INT_ARG(3); + int pH = INT_ARG(4); + int pW = INT_ARG(5); + const auto dH = INT_ARG(6); + const auto dW = INT_ARG(7); + const auto isSameMode = static_cast(INT_ARG(8)); + + REQUIRE_TRUE(dH != 0 && dW != 0, 0, "MAXPOOL2D op: dilation must not be zero, but got instead {%i, %i}", + dH, dW); + + int oH = 0; + int oW = 0; + + int isNCHW = block.getIArguments()->size() > 10 ? !INT_ARG(10) : 1; // INT_ARG(10): 0-NCHW, 1-NHWC + + const int iH = static_cast(isNCHW ? input->sizeAt(2) : input->sizeAt(1)); + const int iW = static_cast(isNCHW ? input->sizeAt(3) : input->sizeAt(2)); + + if (!isNCHW) { + input = new NDArray( + input->permute({0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + output = new NDArray( + output->permute({0, 3, 1, 2})); // [bS, oH, oW, iC] -> [bS, iC, oH, oW] + } + + ConvolutionUtils::calcOutSizePool2D(oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, isSameMode); + + if (isSameMode) + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + const int bS = input->sizeAt(0); + const int iC = input->sizeAt(1); + const int oC = output->sizeAt(1); + + auto poolingMode = PoolingType::MAX_POOL; + int extraParam0 = 1; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + + mkldnnUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, + true, + bS, iC, iH, iW, oC, oH, oW, input, nullptr, output, + algorithm, + &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, + &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, + pool_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + + auto pool_src_memory = user_src_memory; + mkldnn::stream stream(engine); + if (pool_prim_desc.src_desc() != user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + + auto pool_dst_memory = user_dst_memory; + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + pool_dst_memory = 
mkldnn::memory(pool_prim_desc.dst_desc(), engine); + } + + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}}); + + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(pool_dst_memory, user_dst_memory).execute(stream, pool_dst_memory, user_dst_memory); + } + + stream.wait(); + + if (!isNCHW) { + delete input; + delete output; + } + + return Status::OK(); + } + + PLATFORM_CHECK(maxpool2d) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp new file mode 100644 index 000000000..aaead1f26 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d_bp.cpp @@ -0,0 +1,178 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(maxpool2d_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) + auto gradO = INPUT_VARIABLE( + 1); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon + + int kH = INT_ARG(0); // filter(kernel) height + int kW = INT_ARG(1); // filter(kernel) width + int sH = INT_ARG(2); // strides height + int sW = INT_ARG(3); // strides width + int pH = INT_ARG(4); // paddings height + int pW = INT_ARG(5); // paddings width + int dH = INT_ARG(6); // dilations height + int dW = INT_ARG(7); // dilations width + int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME + int extraParam0 = INT_ARG(9); + int isNCHW = + block.getIArguments()->size() > 10 ? 
!INT_ARG(10) : 1; // INT_ARG(10): 0-NCHW, 1-NHWC + + REQUIRE_TRUE(input->rankOf() == 4, 0, + "MAXPOOL2D_BP op: input should have rank of 4, but got %i instead", input->rankOf()); + REQUIRE_TRUE(dH != 0 && dW != 0, 0, + "MAXPOOL2D_BP op: dilation must not be zero, but got instead {%i, %i}", dH, dW); + + int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; + int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, + indIiH, indWiC, indWoC, indWkH, indOoH); + + std::string expectedGradOShape = ShapeUtils::shapeAsString( + ShapeUtils::composeShapeUsingDimsAndIdx({bS, iC, oH, oW, 0, indIOioC, indIiH, indIiH + 1})); + std::string expectedGradIShape = ShapeUtils::shapeAsString( + ShapeUtils::composeShapeUsingDimsAndIdx({bS, iC, iH, iW, 0, indIOioC, indIiH, indIiH + 1})); + REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, + "MAXPOOL2D_BP op: wrong shape of output's gradients array (next epsilon), expected is %s, but got %s instead !", + expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(expectedGradIShape == ShapeUtils::shapeAsString(gradI), 0, + "MAXPOOL2D_BP op: wrong shape of input's gradients array (epsilon), expected is %s, but got %s instead !", + expectedGradIShape.c_str(), ShapeUtils::shapeAsString(gradI).c_str()); + + + if (!isNCHW) { + input = new NDArray(input->permute( + {0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + gradI = new NDArray(gradI->permute( + {0, 3, 1, 2})); // [bS, iH, iW, iC] -> [bS, iC, iH, iW] + gradO = new NDArray(gradO->permute( + {0, 3, 1, 2})); // [bS, oH, oW, iC] -> [bS, iC, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); + + auto poolingMode = PoolingType::MAX_POOL; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + + mkldnnUtils::getMKLDNNMemoryDescPool2d(kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0, + true, + bS, iC, iH, iW, oC, oH, oW, input, gradI, gradO, algorithm, + &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, + &user_diff_src_md, &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + // input is sometimes null, so we can't rely on pool_src_md being valid + auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, + input->buffer() != nullptr ? 
pool_src_md : pool_diff_src_md, + pool_dst_md, pool_strides, pool_kernel, pool_padding, + pool_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + + auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); + auto userB_src_memory = mkldnn::memory(user_src_md, engine, gradI->buffer()); + auto userB_dst_memory = mkldnn::memory(user_dst_md, engine, gradO->buffer()); + + auto poolB_src_memory = userB_src_memory; + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_desc(), engine); + } + + auto poolB_dst_memory = userB_dst_memory; + if (poolB_prim_desc.diff_dst_desc() != userB_dst_memory.get_desc()) { + poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_desc(), engine); + reorder(userB_dst_memory, poolB_dst_memory).execute(stream, userB_dst_memory, poolB_dst_memory); + } + + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto pool_src_memory = user_src_memory; + if (pool_prim_desc.src_desc() != user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + + auto pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_desc(), engine); + auto pool_workspace_memory = mkldnn::memory(pool_prim_desc.workspace_desc(), engine); + + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}, + {MKLDNN_ARG_WORKSPACE, pool_workspace_memory}}); + // probably wrong, fix that + pooling_backward(poolB_prim_desc).execute(stream, {{MKLDNN_ARG_DIFF_DST, poolB_dst_memory}, + {MKLDNN_ARG_WORKSPACE, pool_workspace_memory}, + {MKLDNN_ARG_DIFF_SRC, poolB_src_memory}}); + + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + reorder(poolB_src_memory, userB_src_memory).execute(stream, poolB_src_memory, userB_src_memory); + } + + stream.wait(); + + if (!isNCHW) { + delete input; + delete gradI; + delete gradO; + } + + return Status::OK(); + } + + PLATFORM_CHECK(maxpool2d_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp new file mode 100644 index 000000000..b77059f8f --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp @@ -0,0 +1,155 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(maxpool3dnew) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto output = OUTPUT_VARIABLE( + 0); // [bS, oD, oH, oW, iC] (NDHWC) or [bS, iC, oD, oH, oW] (NCDHW) + + int kD = INT_ARG(0); // filter(kernel) depth + int kH = INT_ARG(1); // filter(kernel) height + int kW = INT_ARG(2); // filter(kernel) width + int sD = INT_ARG(3); // strides depth + int sH = INT_ARG(4); // strides height + int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + int dD = INT_ARG(9); // dilations depth + int dH = INT_ARG(10); // dilations height + int dW = INT_ARG(11); // dilations width + int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + // int extraParam0 = INT_ARG(13); // unnecessary for max case, required only for avg and pnorm cases + int isNCDHW = block.getIArguments()->size() > 14 ? !INT_ARG(14) : 1; // 1-NDHWC, 0-NCDHW + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "MAXPOOL3DNEW OP: rank of input array must be equal to 5, but got %i instead !", + input->rankOf()); + REQUIRE_TRUE(dD != 0 && dH != 0 && dW != 0, 0, + "MAXPOOL3DNEW op: dilation must not be zero, but got instead {%i, %i, %i}", dD, dH, dW); + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + std::string expectedOutputShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, oD, oH, oW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + REQUIRE_TRUE(expectedOutputShape == ShapeUtils::shapeAsString(output), 0, + "MAXPOOL3D op: wrong shape of output array, expected is %s, but got %s instead !", + expectedOutputShape.c_str(), ShapeUtils::shapeAsString(output).c_str()); + // REQUIRE_TRUE(iD >= kD && iH >= kH && iW >= kW, 0, "MAXPOOL3D OP: the input depth/height/width must be greater or equal to kernel(filter) depth/height/width, but got [%i, %i, %i] and [%i, %i, %i] correspondingly !", iD,iH,iW, kD,kH,kW); + // REQUIRE_TRUE(kD/2 >= pD && kH/2 >= pH && kW/2 >= pW, 0, "MAXPOOL3D OP: pad depth/height/width must not be greater than half of kernel depth/height/width, but got [%i, %i, %i] and [%i, %i, %i] correspondingly !", pD,pH,pW, kD,kH,kW); + + if (!isNCDHW) { + input = new NDArray( + input->permute({0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + output = new NDArray( + output->permute({0, 4, 1, 2, 3})); // [bS, oD, oH, oW, iC] -> [bS, iC, oD, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, 
iW, kD, kH, kW, sD, sH, sW, dD, dH, + dW); + + + auto poolingMode = PoolingType::MAX_POOL; + auto extraParam0 = 1; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + + mkldnnUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, + extraParam0, true, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, nullptr, output, + algorithm, + &pool_src_md, nullptr, &pool_dst_md, &user_src_md, nullptr, + &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto pool_desc = pooling_forward::desc(prop_kind::forward_inference, algorithm, pool_src_md, + pool_dst_md, pool_strides, pool_kernel, pool_padding, + pool_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + auto user_dst_memory = mkldnn::memory(user_dst_md, engine, output->buffer()); + + auto pool_src_memory = user_src_memory; + if (pool_prim_desc.src_desc() != user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + + auto pool_dst_memory = user_dst_memory; + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_desc(), engine); + } + + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}}); + + if (pool_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(pool_dst_memory, user_dst_memory).execute(stream, pool_dst_memory, user_dst_memory); + } + + stream.wait(); + + + if (!isNCDHW) { + delete input; + delete output; + } + + return Status::OK(); + } + + PLATFORM_CHECK(maxpool3dnew) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp new file mode 100644 index 000000000..af0be5897 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling_3d_bp.cpp @@ -0,0 +1,185 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#include +#include "mkldnnUtils.h" +#include + +using namespace mkldnn; + +namespace nd4j { + namespace ops { + namespace platforms { + PLATFORM_IMPL(maxpool3dnew_bp) { + auto input = INPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW) + auto gradO = INPUT_VARIABLE( + 1); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next + auto gradI = OUTPUT_VARIABLE( + 0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon + + const int kD = INT_ARG(0); // filter(kernel) depth + const int kH = INT_ARG(1); // filter(kernel) height + const int kW = INT_ARG(2); // filter(kernel) width + const int sD = INT_ARG(3); // strides depth + const int sH = INT_ARG(4); // strides height + const int sW = INT_ARG(5); // strides width + int pD = INT_ARG(6); // paddings depth + int pH = INT_ARG(7); // paddings height + int pW = INT_ARG(8); // paddings width + const int dD = INT_ARG(9); // dilations depth + const int dH = INT_ARG(10); // dilations height + const int dW = INT_ARG(11); // dilations width + const int isSameMode = INT_ARG(12); // 1-SAME, 0-VALID + // int extraParam0 = INT_ARG(13); // unnecessary for max case, required only for avg and pnorm cases + int isNCDHW = block.getIArguments()->size() > 14 ? !INT_ARG(14) : 1; // 1-NDHWC, 0-NCDHW + + REQUIRE_TRUE(input->rankOf() == 5, 0, + "MAXPOOL3D_BP op: input should have rank of 5, but got %i instead", input->rankOf()); + REQUIRE_TRUE(dD != 0 && dH != 0 && dW != 0, 0, + "MAXPOOL3DNEW op: dilation must not be zero, but got instead {%i, %i, %i}", dD, dH, dW); + + int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width; + int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes + ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, + indIOioC, indIOioD, indWiC, indWoC, indWkD); + + std::string expectedGradOShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, oD, oH, oW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + std::string expectedGradIShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx( + {bS, iC, iD, iH, iW, 0, indIOioC, indIOioD, indIOioD + 1, indIOioD + 2})); + REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, + "MAXPOOL3D_BP op: wrong shape of output's gradients array (next epsilon), expected is %s, but got %s instead !", + expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str()); + REQUIRE_TRUE(expectedGradIShape == ShapeUtils::shapeAsString(gradI), 0, + "MAXPOOL3D_BP op: wrong shape of input's gradients array (epsilon), expected is %s, but got %s instead !", + expectedGradIShape.c_str(), ShapeUtils::shapeAsString(gradI).c_str()); + + if (!isNCDHW) { + input = new NDArray(input->permute( + {0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + gradI = new NDArray(gradI->permute( + {0, 4, 1, 2, 3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW] + gradO = new NDArray(gradO->permute( + {0, 4, 1, 2, 3})); // [bS, oD, oH, oW, iC] -> [bS, iC, oD, oH, oW] + } + + if (isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, + dW); + + + auto poolingMode = 
PoolingType::MAX_POOL; + auto extraParam0 = 1; + + mkldnn_memory_desc_t empty; + mkldnn::memory::desc pool_src_md(empty), pool_diff_src_md(empty), pool_dst_md(empty); + mkldnn::memory::desc user_src_md(empty), user_diff_src_md(empty), user_dst_md(empty); + mkldnn::memory::dims pool_strides, pool_kernel, pool_padding, pool_padding_r; + mkldnn::algorithm algorithm; + + mkldnnUtils::getMKLDNNMemoryDescPool3d(kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, + extraParam0, true, + bS, iC, iD, iH, iW, oC, oD, oH, oW, input, gradI, gradO, + algorithm, + &pool_src_md, &pool_diff_src_md, &pool_dst_md, &user_src_md, + &user_diff_src_md, &user_dst_md, + pool_strides, pool_kernel, pool_padding, pool_padding_r); + + // input is sometimes null, so we can't rely on pool_src_md being valid + if (input->buffer() == nullptr) { + pool_src_md = pool_diff_src_md; + user_src_md = user_diff_src_md; + } + auto pool_desc = pooling_forward::desc(prop_kind::forward, algorithm, pool_src_md, pool_dst_md, pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto engine = mkldnnUtils::getEngine(LaunchContext::defaultContext()->engine()); + mkldnn::stream stream(engine); + auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, engine); + + auto poolB_desc = pooling_backward::desc(algorithm, pool_diff_src_md, pool_dst_md, pool_strides, pool_kernel, pool_padding, pool_padding_r); + + auto poolB_prim_desc = pooling_backward::primitive_desc(poolB_desc, engine, pool_prim_desc); + auto userB_src_memory = mkldnn::memory(user_diff_src_md, engine, gradI->buffer()); + auto userB_dst_memory = mkldnn::memory(user_dst_md, engine, gradO->buffer()); + + auto poolB_src_memory = userB_src_memory; + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + poolB_src_memory = mkldnn::memory(poolB_prim_desc.diff_src_desc(), engine); + } + + auto poolB_dst_memory = userB_dst_memory; + if (poolB_prim_desc.diff_dst_desc() != userB_dst_memory.get_desc()) { + poolB_dst_memory = mkldnn::memory(poolB_prim_desc.diff_dst_desc(), engine); + reorder(userB_dst_memory, poolB_dst_memory).execute(stream, userB_dst_memory, poolB_dst_memory); + } + + + auto user_src_memory = mkldnn::memory(user_src_md, engine, input->buffer()); + + auto pool_src_memory = user_src_memory; + if (pool_prim_desc.src_desc() != user_src_memory.get_desc()) { + pool_src_memory = mkldnn::memory(pool_prim_desc.src_desc(), engine); + reorder(user_src_memory, pool_src_memory).execute(stream, user_src_memory, pool_src_memory); + } + + auto pool_dst_memory = mkldnn::memory(pool_prim_desc.dst_desc(), engine); + auto pool_workspace_memory = mkldnn::memory(pool_prim_desc.workspace_desc(), engine); + + pooling_forward(pool_prim_desc).execute(stream, {{MKLDNN_ARG_SRC, pool_src_memory}, + {MKLDNN_ARG_DST, pool_dst_memory}, + {MKLDNN_ARG_WORKSPACE, pool_workspace_memory}}); + pooling_backward(poolB_prim_desc).execute(stream, {{MKLDNN_ARG_DIFF_DST, poolB_dst_memory}, + {MKLDNN_ARG_WORKSPACE, pool_workspace_memory}, + {MKLDNN_ARG_DIFF_SRC, poolB_src_memory}}); + + + if (poolB_prim_desc.diff_src_desc() != userB_src_memory.get_desc()) { + reorder(poolB_src_memory, userB_src_memory).execute(stream, poolB_src_memory, userB_src_memory); + } + + stream.wait(); + + if (!isNCDHW) { + delete input; + delete gradI; + delete gradO; + } + + return Status::OK(); + } + + PLATFORM_CHECK(maxpool3dnew_bp) { + // we don't want to use mkldnn if cpu doesn't support avx/avx2 + if (::optimalLevel() < 2) + return false; + + auto input = INPUT_VARIABLE(0); + auto output = 
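+                // the ::optimalLevel() guard above keeps this helper off CPUs without AVX/AVX2, matching the
+                // other mkldnn helpers in this patch; returning false here makes the executor fall back to the
+                // generic maxpool3dnew_bp implementation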
OUTPUT_VARIABLE(0); + + return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + } + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp new file mode 100644 index 000000000..4fac4a1b7 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp @@ -0,0 +1,404 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// + +#include +#include "mkldnnUtils.h" + +using namespace mkldnn; + +namespace nd4j { + namespace mkldnnUtils { + void getMKLDNNMemoryDescPool2d( + int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, int poolingMode, int extraParam0, bool isNCHW, + int bS, int iC, int iH, int iW, int oC, int oH, int oW, + const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, + mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r) { + mkldnn::memory::dims pool_src_tz = { bS, iC, iH, iW }; + mkldnn::memory::dims pool_dst_tz = { bS, oC, oH, oW }; + + pool_strides = { sH, sW }; + pool_kernel = { kH, kW }; + pool_padding = { pH, pW }; + pool_padding_r = { (oH - 1) * sH - iH + kH - pH, + (oW - 1) * sW - iW + kW - pW }; + + algorithm = poolingMode == 0 ? algorithm::pooling_max + : extraParam0 == 0 ? algorithm::pooling_avg_exclude_padding + : algorithm::pooling_avg_include_padding; + auto type = mkldnn::memory::data_type::f32; + auto format = isNCHW ? mkldnn::memory::format_tag::nchw : mkldnn::memory::format_tag::nhwc; + auto supposed_to_be_any_format = mkldnn::memory::format_tag::nChw8c; // doesn't work with "any" + + if (src != nullptr && src->getBuffer() != nullptr && pool_src_md != nullptr) { + *pool_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); + *user_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[isNCHW ? 0 : 0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[isNCHW ? 1 : 3]; + user_src_md->data.format_desc.blocking.strides[2] = src->stridesOf()[isNCHW ? 2 : 1]; + user_src_md->data.format_desc.blocking.strides[3] = src->stridesOf()[isNCHW ? 
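+                // user_src_md describes the nd4j buffer exactly as laid out in memory: the logical order is
+                // always (n, c, h, w), so for an NHWC array the strides are taken from positions (0, 3, 1, 2)
+                // of stridesOf(); pool_src_md above uses the blocked nChw8c layout instead (format_tag::any is
+                // noted as not working here), and a reorder bridges the two when they differ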
3 : 2]; + } + + if (diff_src != nullptr && diff_src->getBuffer() != nullptr && pool_diff_src_md != nullptr) { + *pool_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); + *user_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[isNCHW ? 0 : 0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[isNCHW ? 1 : 3]; + user_diff_src_md->data.format_desc.blocking.strides[2] = diff_src->stridesOf()[isNCHW ? 2 : 1]; + user_diff_src_md->data.format_desc.blocking.strides[3] = diff_src->stridesOf()[isNCHW ? 3 : 2]; + } + + if (dst != nullptr && dst->getBuffer() != nullptr && pool_dst_md != nullptr) { + *pool_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, supposed_to_be_any_format); + *user_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[isNCHW ? 0 : 0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[isNCHW ? 1 : 3]; + user_dst_md->data.format_desc.blocking.strides[2] = dst->stridesOf()[isNCHW ? 2 : 1]; + user_dst_md->data.format_desc.blocking.strides[3] = dst->stridesOf()[isNCHW ? 3 : 2]; + } + }; + + + void getMKLDNNMemoryDescPool3d( + int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, int poolingMode, int extraParam0, bool isNCDHW, + int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, + const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, + mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r) { + mkldnn::memory::dims pool_src_tz = { bS, iC, iD, iH, iW }; + mkldnn::memory::dims pool_dst_tz = { bS, oC, oD, oH, oW }; + + pool_strides = { sD, sH, sW }; + pool_kernel = { kD, kH, kW }; + pool_padding = { pD, pH, pW }; + pool_padding_r = { (oD - 1) * sD - iD + kD - pD, + (oH - 1) * sH - iH + kH - pH, + (oW - 1) * sW - iW + kW - pW }; + + algorithm = poolingMode == 0 ? algorithm::pooling_max + : extraParam0 == 0 ? algorithm::pooling_avg_exclude_padding + : algorithm::pooling_avg_include_padding; + auto type = mkldnn::memory::data_type::f32; + auto format = isNCDHW ? mkldnn::memory::format_tag::ncdhw : mkldnn::memory::format_tag::ndhwc; + auto supposed_to_be_any_format = mkldnn::memory::format_tag::nCdhw8c; // doesn't work with "any" + + if (src != nullptr && src->getBuffer() != nullptr && pool_src_md != nullptr) { + *pool_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); + *user_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[isNCDHW ? 0 : 0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[isNCDHW ? 1 : 4]; + user_src_md->data.format_desc.blocking.strides[2] = src->stridesOf()[isNCDHW ? 
2 : 1]; + user_src_md->data.format_desc.blocking.strides[3] = src->stridesOf()[isNCDHW ? 3 : 2]; + user_src_md->data.format_desc.blocking.strides[4] = src->stridesOf()[isNCDHW ? 4 : 3]; + } + + if (diff_src != nullptr && diff_src->getBuffer() != nullptr && pool_diff_src_md != nullptr) { + *pool_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, supposed_to_be_any_format); + *user_diff_src_md = mkldnn::memory::desc({ pool_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[isNCDHW ? 0 : 0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[isNCDHW ? 1 : 4]; + user_diff_src_md->data.format_desc.blocking.strides[2] = diff_src->stridesOf()[isNCDHW ? 2 : 1]; + user_diff_src_md->data.format_desc.blocking.strides[3] = diff_src->stridesOf()[isNCDHW ? 3 : 2]; + user_diff_src_md->data.format_desc.blocking.strides[4] = diff_src->stridesOf()[isNCDHW ? 4 : 3]; + } + + if (dst != nullptr && dst->getBuffer() != nullptr && pool_dst_md != nullptr) { + *pool_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, supposed_to_be_any_format); + *user_dst_md = mkldnn::memory::desc({ pool_dst_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[isNCDHW ? 0 : 0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[isNCDHW ? 1 : 4]; + user_dst_md->data.format_desc.blocking.strides[2] = dst->stridesOf()[isNCDHW ? 2 : 1]; + user_dst_md->data.format_desc.blocking.strides[3] = dst->stridesOf()[isNCDHW ? 3 : 2]; + user_dst_md->data.format_desc.blocking.strides[4] = dst->stridesOf()[isNCDHW ? 4 : 3]; + } + }; + + + + void getMKLDNNMemoryDescConv2d( + int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, bool isSameMode, bool isNCHW, + int bS, int iC, int iH, int iW, int oC, int oH, int oW, const NDArray* src, const NDArray* diff_src, + const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, + mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, + mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, + mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r) { + mkldnn::memory::dims conv_src_tz = { bS, iC, iH, iW }; + mkldnn::memory::dims conv_weights_tz = { oC, iC, kH, kW }; + mkldnn::memory::dims conv_bias_tz = { oC }; + mkldnn::memory::dims conv_dst_tz = { bS, oC, oH, oW }; + + conv_strides = { sH, sW }; + conv_padding = { pH, pW }; + conv_padding_r = { (oH - 1) * sH - iH + kH - pH, + (oW - 1) * sW - iW + kW - pW }; + + auto type = mkldnn::memory::data_type::f32; + auto format = isNCHW ? 
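+                // unlike the pooling helpers, the conv_* descriptors use format_tag::any so MKL-DNN can choose
+                // whatever layout its convolution kernels prefer; the user_* descriptors below carry the real
+                // nd4j strides, so the callers can insert reorders wherever the two layouts differ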
mkldnn::memory::format_tag::nchw : mkldnn::memory::format_tag::nhwc; + auto formatw = mkldnn::memory::format_tag::hwio; + + if (src != nullptr && conv_src_md != nullptr) { + *conv_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format_tag::any); + *user_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[isNCHW ? 0 : 0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[isNCHW ? 1 : 3]; + user_src_md->data.format_desc.blocking.strides[2] = src->stridesOf()[isNCHW ? 2 : 1]; + user_src_md->data.format_desc.blocking.strides[3] = src->stridesOf()[isNCHW ? 3 : 2]; + } + + if (diff_src != nullptr && conv_diff_src_md != nullptr) { + *conv_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format_tag::any); + *user_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[isNCHW ? 0 : 0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[isNCHW ? 1 : 3]; + user_diff_src_md->data.format_desc.blocking.strides[2] = diff_src->stridesOf()[isNCHW ? 2 : 1]; + user_diff_src_md->data.format_desc.blocking.strides[3] = diff_src->stridesOf()[isNCHW ? 3 : 2]; + } + + if (weights != nullptr && conv_weights_md != nullptr) { + *conv_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format_tag::any); + *user_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); + user_weights_md->data.format_kind = mkldnn_blocked; // overrides "formatw = hwio" + user_weights_md->data.format_desc.blocking.strides[0] = weights->stridesOf()[3]; + user_weights_md->data.format_desc.blocking.strides[1] = weights->stridesOf()[2]; + user_weights_md->data.format_desc.blocking.strides[2] = weights->stridesOf()[0]; + user_weights_md->data.format_desc.blocking.strides[3] = weights->stridesOf()[1]; + } + + if (diff_weights != nullptr && conv_diff_weights_md != nullptr) { + *conv_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format_tag::any); + *user_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); + user_diff_weights_md->data.format_kind = mkldnn_blocked; // overrides "formatw = hwio" + user_diff_weights_md->data.format_desc.blocking.strides[0] = diff_weights->stridesOf()[3]; + user_diff_weights_md->data.format_desc.blocking.strides[1] = diff_weights->stridesOf()[2]; + user_diff_weights_md->data.format_desc.blocking.strides[2] = diff_weights->stridesOf()[0]; + user_diff_weights_md->data.format_desc.blocking.strides[3] = diff_weights->stridesOf()[1]; + } + + if (bias != nullptr && conv_bias_md != nullptr) { + *conv_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format_tag::any); + *user_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format_tag::x); + } + + if (dst != nullptr && conv_dst_md != nullptr) { + *conv_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, mkldnn::memory::format_tag::any); + *user_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCHW ? nchw : nhwc" + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[isNCHW ? 
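+                // the batch axis is dimension 0 in both NCHW and NHWC, so "isNCHW ? 0 : 0" below is deliberate;
+                // only the channel and spatial strides change position in the NHWC case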
0 : 0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[isNCHW ? 1 : 3]; + user_dst_md->data.format_desc.blocking.strides[2] = dst->stridesOf()[isNCHW ? 2 : 1]; + user_dst_md->data.format_desc.blocking.strides[3] = dst->stridesOf()[isNCHW ? 3 : 2]; + } + } + + void getMKLDNNMemoryDescConv3d( + int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, bool isSameMode, bool isNCDHW, + int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, const NDArray* src, const NDArray* diff_src, + const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, + mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, + mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, + mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r) { + mkldnn::memory::dims conv_src_tz = { bS, iC, iD, iH, iW }; + mkldnn::memory::dims conv_weights_tz = { oC, iC, kD, kH, kW }; + mkldnn::memory::dims conv_bias_tz = { oC }; + mkldnn::memory::dims conv_dst_tz = { bS, oC, oD, oH, oW }; + + conv_strides = { sD, sH, sW }; + conv_padding = { pD, pH, pW }; + conv_padding_r = { (oD - 1) * sD - iD + kD - pD, + (oH - 1) * sH - iH + kH - pH, + (oW - 1) * sW - iW + kW - pW }; + + auto type = mkldnn::memory::data_type::f32; + auto format = isNCDHW ? mkldnn::memory::format_tag::ncdhw : mkldnn::memory::format_tag::ndhwc; + auto formatw = mkldnn::memory::format_tag::dhwio; + + if (src != nullptr && conv_src_md != nullptr) { + *conv_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format_tag::any); + *user_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[isNCDHW ? 0 : 0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[isNCDHW ? 1 : 4]; + user_src_md->data.format_desc.blocking.strides[2] = src->stridesOf()[isNCDHW ? 2 : 1]; + user_src_md->data.format_desc.blocking.strides[3] = src->stridesOf()[isNCDHW ? 3 : 2]; + user_src_md->data.format_desc.blocking.strides[4] = src->stridesOf()[isNCDHW ? 4 : 3]; + } + + if (diff_src != nullptr && conv_diff_src_md != nullptr) { + *conv_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, mkldnn::memory::format_tag::any); + *user_diff_src_md = mkldnn::memory::desc({ conv_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[isNCDHW ? 0 : 0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[isNCDHW ? 1 : 4]; + user_diff_src_md->data.format_desc.blocking.strides[2] = diff_src->stridesOf()[isNCDHW ? 2 : 1]; + user_diff_src_md->data.format_desc.blocking.strides[3] = diff_src->stridesOf()[isNCDHW ? 3 : 2]; + user_diff_src_md->data.format_desc.blocking.strides[4] = diff_src->stridesOf()[isNCDHW ? 
4 : 3]; + } + + if (weights != nullptr && conv_weights_md != nullptr) { + *conv_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format_tag::any); + *user_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); + user_weights_md->data.format_kind = mkldnn_blocked; // overrides "formatw = dhwio" + user_weights_md->data.format_desc.blocking.strides[0] = weights->stridesOf()[4]; + user_weights_md->data.format_desc.blocking.strides[1] = weights->stridesOf()[3]; + user_weights_md->data.format_desc.blocking.strides[2] = weights->stridesOf()[0]; + user_weights_md->data.format_desc.blocking.strides[3] = weights->stridesOf()[1]; + user_weights_md->data.format_desc.blocking.strides[4] = weights->stridesOf()[2]; + } + + if (diff_weights != nullptr && conv_diff_weights_md != nullptr) { + *conv_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, mkldnn::memory::format_tag::any); + *user_diff_weights_md = mkldnn::memory::desc({ conv_weights_tz }, type, formatw); + user_diff_weights_md->data.format_kind = mkldnn_blocked; // overrides "formatw = dhwio" + user_diff_weights_md->data.format_desc.blocking.strides[0] = diff_weights->stridesOf()[4]; + user_diff_weights_md->data.format_desc.blocking.strides[1] = diff_weights->stridesOf()[3]; + user_diff_weights_md->data.format_desc.blocking.strides[2] = diff_weights->stridesOf()[0]; + user_diff_weights_md->data.format_desc.blocking.strides[3] = diff_weights->stridesOf()[1]; + user_diff_weights_md->data.format_desc.blocking.strides[4] = diff_weights->stridesOf()[2]; + } + + if (bias != nullptr && conv_bias_md != nullptr) { + *conv_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format_tag::any); + *user_bias_md = mkldnn::memory::desc({ conv_bias_tz }, type, mkldnn::memory::format_tag::x); + } + + if (dst != nullptr && conv_dst_md != nullptr) { + *conv_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, mkldnn::memory::format_tag::any); + *user_dst_md = mkldnn::memory::desc({ conv_dst_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; // overrides "format = isNCDHW ? ncdhw : ndhwc" + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[isNCDHW ? 0 : 0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[isNCDHW ? 1 : 4]; + user_dst_md->data.format_desc.blocking.strides[2] = dst->stridesOf()[isNCDHW ? 2 : 1]; + user_dst_md->data.format_desc.blocking.strides[3] = dst->stridesOf()[isNCDHW ? 3 : 2]; + user_dst_md->data.format_desc.blocking.strides[4] = dst->stridesOf()[isNCDHW ? 4 : 3]; + } + }; + + + void getMKLDNNMemoryDescBatchNorm(const NDArray* src, const NDArray* diff_src, const NDArray* dst, + mkldnn::memory::desc* batchnorm_src_md, mkldnn::memory::desc* batchnorm_diff_src_md, mkldnn::memory::desc* batchnorm_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis) { + const Nd4jLong* shape = src->getShapeInfo(); + Nd4jLong rank = shape[0]; + Nd4jLong dim1 = axis; // MKL-DNN supports only 1 axis, which has to be the "channel" one + Nd4jLong dim2 = axis >= 2 ? 1 : 2; + Nd4jLong dim3 = axis >= 3 ? 2 : 3; + mkldnn::memory::dims batchnorm_src_tz = { (int)shape[1], (int)shape[dim1 + 1], rank > 2 ? (int)shape[dim2 + 1] : 1, rank > 3 ? 
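+                // the source shape is mapped onto a 4D (n, c, h, w) view: dimension 0 stays the batch,
+                // the requested axis supplies the channel, the remaining dimensions fill h/w, and anything
+                // missing for lower-rank inputs is padded with 1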
(int)shape[dim3 + 1] : 1}; + + auto type = mkldnn::memory::data_type::f32; + auto format = mkldnn::memory::format_tag::nchw; + auto supposed_to_be_any_format = mkldnn::memory::format_tag::nChw8c; // doesn't work with "any" + + if (src != nullptr && src->getBuffer() != nullptr && batchnorm_src_md != nullptr) { + *batchnorm_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); + *user_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; // overrides format + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[dim1]; + user_src_md->data.format_desc.blocking.strides[2] = rank > 2 ? src->stridesOf()[dim2] : 1; + user_src_md->data.format_desc.blocking.strides[3] = rank > 3 ? src->stridesOf()[dim3] : 1; + } + + if (diff_src != nullptr && diff_src->getBuffer() != nullptr && batchnorm_diff_src_md != nullptr) { + *batchnorm_diff_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); + *user_diff_src_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; // overrides format + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[dim1]; + user_diff_src_md->data.format_desc.blocking.strides[2] = rank > 2 ? diff_src->stridesOf()[dim2] : 1; + user_diff_src_md->data.format_desc.blocking.strides[3] = rank > 3 ? diff_src->stridesOf()[dim3] : 1; + } + + if (dst != nullptr && dst->getBuffer() != nullptr && batchnorm_dst_md != nullptr) { + *batchnorm_dst_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, supposed_to_be_any_format); + *user_dst_md = mkldnn::memory::desc({ batchnorm_src_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; // overrides format + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[dim1]; + user_dst_md->data.format_desc.blocking.strides[2] = rank > 2 ? dst->stridesOf()[dim2] : 1; + user_dst_md->data.format_desc.blocking.strides[3] = rank > 3 ? dst->stridesOf()[dim3] : 1; + } + }; + + + void getMKLDNNMemoryDescLrn(const NDArray* src, const NDArray* diff_src, const NDArray* dst, + mkldnn::memory::desc* lrn_src_md, mkldnn::memory::desc* lrn_diff_src_md, mkldnn::memory::desc* lrn_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis) { + const Nd4jLong* shape = src->getShapeInfo(); + long rank = shape[0]; + long dim1 = axis; // MKL-DNN supports only 1 axis, which has to be the "channel" one + long dim2 = axis >= 2 ? 1 : 2; + long dim3 = axis >= 3 ? 2 : 3; + mkldnn::memory::dims lrn_src_tz = { (int)shape[1], (int)shape[dim1 + 1], rank > 2 ? (int)shape[dim2 + 1] : 1, rank > 3 ? (int)shape[dim3 + 1] : 1}; + + auto type = mkldnn::memory::data_type::f32; + auto format = axis == 1 ? 
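+                // for lrn no blocked layout is forced: the "supposed_to_be_any_format" placeholder is simply
+                // the plain nchw/nhwc tag chosen here, so a reorder is typically unnecessary for contiguous inputs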
mkldnn::memory::format_tag::nchw : mkldnn::memory::format_tag::nhwc; + auto supposed_to_be_any_format = format; // doesn't work with "any" + + if (src != nullptr && src->getBuffer() != nullptr && lrn_src_md != nullptr) { + *lrn_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); + *user_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); + user_src_md->data.format_kind = mkldnn_blocked; + user_src_md->data.format_desc.blocking.strides[0] = src->stridesOf()[0]; + user_src_md->data.format_desc.blocking.strides[1] = src->stridesOf()[dim1]; + user_src_md->data.format_desc.blocking.strides[2] = rank > 2 ? src->stridesOf()[dim2] : 1; + user_src_md->data.format_desc.blocking.strides[3] = rank > 3 ? src->stridesOf()[dim3] : 1; + } + + if (diff_src != nullptr && diff_src->getBuffer() != nullptr && lrn_diff_src_md != nullptr) { + *lrn_diff_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); + *user_diff_src_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); + user_diff_src_md->data.format_kind = mkldnn_blocked; + user_diff_src_md->data.format_desc.blocking.strides[0] = diff_src->stridesOf()[0]; + user_diff_src_md->data.format_desc.blocking.strides[1] = diff_src->stridesOf()[dim1]; + user_diff_src_md->data.format_desc.blocking.strides[2] = rank > 2 ? diff_src->stridesOf()[dim2] : 1; + user_diff_src_md->data.format_desc.blocking.strides[3] = rank > 3 ? diff_src->stridesOf()[dim3] : 1; + } + + if (dst != nullptr && dst->getBuffer() != nullptr && lrn_dst_md != nullptr) { + *lrn_dst_md = mkldnn::memory::desc({ lrn_src_tz }, type, supposed_to_be_any_format); + *user_dst_md = mkldnn::memory::desc({ lrn_src_tz }, type, format); + user_dst_md->data.format_kind = mkldnn_blocked; + user_dst_md->data.format_desc.blocking.strides[0] = dst->stridesOf()[0]; + user_dst_md->data.format_desc.blocking.strides[1] = dst->stridesOf()[dim1]; + user_dst_md->data.format_desc.blocking.strides[2] = rank > 2 ? dst->stridesOf()[dim2] : 1; + user_dst_md->data.format_desc.blocking.strides[3] = rank > 3 ? dst->stridesOf()[dim3] : 1; + } + } + + mkldnn::engine& getEngine(void *ptr) { + auto eng = reinterpret_cast(ptr); + return *eng; + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h new file mode 100644 index 000000000..4e79974a5 --- /dev/null +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h @@ -0,0 +1,124 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author saudet +// + +#ifndef DEV_TESTS_MKLDNNUTILS_H +#define DEV_TESTS_MKLDNNUTILS_H + +#include +#include +#include +#include +#include +#include +#include + + +namespace nd4j{ + namespace ops { + namespace platforms { + /** + * Here we actually declare our platform helpers + */ + DECLARE_PLATFORM(conv2d); + + DECLARE_PLATFORM(conv2d_bp); + + DECLARE_PLATFORM(avgpool2d); + + DECLARE_PLATFORM(avgpool2d_bp); + + DECLARE_PLATFORM(maxpool2d); + + DECLARE_PLATFORM(maxpool2d_bp); + + DECLARE_PLATFORM(conv3dnew); + + DECLARE_PLATFORM(conv3dnew_bp); + + DECLARE_PLATFORM(maxpool3dnew); + + DECLARE_PLATFORM(maxpool3dnew_bp); + + DECLARE_PLATFORM(avgpool3dnew); + + DECLARE_PLATFORM(avgpool3dnew_bp); + + DECLARE_PLATFORM(lrn); + + DECLARE_PLATFORM(batchnorm_new); + } + } + + namespace mkldnnUtils { + + /** + * Utility methods for MKLDNN + */ + void getMKLDNNMemoryDescConv2d( + int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, bool isSameMode, bool isNCHW, + int bS, int iC, int iH, int iW, int oC, int oH, int oW, const NDArray* src, const NDArray* diff_src, + const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, + mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, + mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, + mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r); + + void getMKLDNNMemoryDescConv3d( + int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, bool isSameMode, bool isNCDHW, + int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, const NDArray* src, const NDArray* diff_src, + const NDArray* weights, const NDArray* diff_weights, const NDArray* bias, const NDArray* dst, + mkldnn::memory::desc* conv_src_md, mkldnn::memory::desc* conv_diff_src_md, mkldnn::memory::desc* conv_weights_md, + mkldnn::memory::desc* conv_diff_weights_md, mkldnn::memory::desc* conv_bias_md, mkldnn::memory::desc* conv_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_weights_md, + mkldnn::memory::desc* user_diff_weights_md, mkldnn::memory::desc* user_bias_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& conv_strides, mkldnn::memory::dims& conv_padding, mkldnn::memory::dims& conv_padding_r); + + void getMKLDNNMemoryDescPool2d( + int kH, int kW, int sH, int sW, int pH, int pW, int dH, int dW, int poolingMode, int extraParam0, bool isNCHW, + int bS, int iC, int iH, int iW, int oC, int oH, int oW, + const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, + mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r); + + void 
getMKLDNNMemoryDescPool3d( + int kD, int kH, int kW, int sD, int sH, int sW, int pD, int pH, int pW, int dD, int dH, int dW, int poolingMode, int extraParam0, bool isNCDHW, + int bS, int iC, int iD, int iH, int iW, int oC, int oD, int oH, int oW, + const NDArray* src, const NDArray* diff_src, const NDArray* dst, mkldnn::algorithm& algorithm, + mkldnn::memory::desc* pool_src_md, mkldnn::memory::desc* pool_diff_src_md, mkldnn::memory::desc* pool_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, + mkldnn::memory::dims& pool_strides, mkldnn::memory::dims& pool_kernel, mkldnn::memory::dims& pool_padding, mkldnn::memory::dims& pool_padding_r); + + void getMKLDNNMemoryDescBatchNorm(const NDArray* src, const NDArray* diff_src, const NDArray* dst, + mkldnn::memory::desc* batchnorm_src_md, mkldnn::memory::desc* batchnorm_diff_src_md, mkldnn::memory::desc* batchnorm_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis); + + void getMKLDNNMemoryDescLrn(const NDArray* src, const NDArray* diff_src, const NDArray* dst, + mkldnn::memory::desc* lrn_src_md, mkldnn::memory::desc* lrn_diff_src_md, mkldnn::memory::desc* lrn_dst_md, + mkldnn::memory::desc* user_src_md, mkldnn::memory::desc* user_diff_src_md, mkldnn::memory::desc* user_dst_md, int axis); + + mkldnn::engine& getEngine(void *ptr); + } +} + + + +#endif //DEV_TESTS_MKLDNNUTILS_H diff --git a/libnd4j/include/platform_boilerplate.h b/libnd4j/include/platform_boilerplate.h new file mode 100644 index 000000000..d3883bcf7 --- /dev/null +++ b/libnd4j/include/platform_boilerplate.h @@ -0,0 +1,45 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_PLATFORM_BOILERPLATE_H +#define SD_PLATFORM_BOILERPLATE_H + + +#define DECLARE_PLATFORM(NAME) class ND4J_EXPORT PLATFORM_##NAME : public PlatformHelper {\ + public: \ + PLATFORM_##NAME() : PlatformHelper(#NAME) { } \ + bool isUsable(graph::Context &context) override; \ + Nd4jStatus invokeHelper(graph::Context &context) override; \ + }; + +#define PLATFORM_IMPL(NAME) struct ND4J_EXPORT __registratorPlatformHelper_##NAME { \ + __registratorPlatformHelper_##NAME() { \ + auto helper = new PLATFORM_##NAME(); \ + OpRegistrator::getInstance()->registerHelper(helper); \ + } \ + }; \ + static __registratorPlatformHelper_##NAME platformHelper_##NAME; \ + Nd4jStatus PLATFORM_##NAME::invokeHelper(nd4j::graph::Context &block) + + +#define PLATFORM_CHECK(NAME) bool PLATFORM_##NAME::isUsable(graph::Context &block) + + +#endif //SD_PLATFORM_BOILERPLATE_H diff --git a/libnd4j/include/types/float16.h b/libnd4j/include/types/float16.h index f75a292d4..0cc75daed 100644 --- a/libnd4j/include/types/float16.h +++ b/libnd4j/include/types/float16.h @@ -21,7 +21,7 @@ #include #include #include -#if defined(__INTEL_COMPILER) || defined(__F16C__) +#if defined(__INTEL_COMPILER) || defined(SD_F16C) #include #endif @@ -122,7 +122,7 @@ static local_def unsigned short hneg(unsigned short h) { } -#if defined(__INTEL_COMPILER) || defined(__F16C__) +#if defined(__INTEL_COMPILER) || defined(SD_F16C) //_Pragma("omp declare simd") inline local_def float cpu_ihalf2float(ihalf h) { return _cvtsh_ss(h.getX()); @@ -157,7 +157,7 @@ local_def float cpu_ihalf2float(ihalf h) { } #endif -#if defined(__INTEL_COMPILER) || defined(__F16C__) +#if defined(__INTEL_COMPILER) || defined(SD_F16C) //_Pragma("omp declare simd") inline local_def ihalf cpu_float2ihalf_rn(float f) { ihalf ret; diff --git a/libnd4j/pom.xml b/libnd4j/pom.xml index be3136fc5..740fe4774 100644 --- a/libnd4j/pom.xml +++ b/libnd4j/pom.xml @@ -74,6 +74,7 @@ ${libnd4j.platform} + @@ -175,6 +176,8 @@ ${libnd4j.tests} -j ${libnd4j.buildthreads} + -h + ${libnd4j.helper} ${project.basedir} @@ -391,5 +394,30 @@ + + + libnd4j-helper-avx2 + + + libnd4j.extension + avx2 + + + + mkldnn + + + + libnd4j-helper-avx512 + + + libnd4j.extension + avx512 + + + + mkldnn + + diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 6aa483ef3..6f964d0ac 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -131,7 +131,7 @@ endforeach(TMP_PATH) if (CPU_BLAS) add_executable(runtests ${TEST_SOURCES}) - target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} gtest gtest_main) + target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) elseif(CUDA_BLAS) CUDA_ADD_EXECUTABLE(runtests ${TEST_SOURCES}) target_link_libraries(runtests ${LIBND4J_NAME} ${CUDA_LIBRARIES} gtest gtest_main) diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index d8cf86495..ff36c9cca 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -32,6 +32,10 @@ #include #include +#ifdef HAVE_MKLDNN +#include +#endif + using namespace nd4j; using namespace nd4j::graph; 
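For reference, the three macros from platform_boilerplate.h above are meant to be used together when a new helper is added. The sketch below is illustrative only (the op name someop is hypothetical and not part of this patch): DECLARE_PLATFORM goes into mkldnnUtils.h, PLATFORM_IMPL carries the MKL-DNN kernel and registers the helper statically via OpRegistrator, and PLATFORM_CHECK decides at runtime whether the helper may be used for a given graph::Context.

    // in mkldnnUtils.h
    DECLARE_PLATFORM(someop);

    // in ops/declarable/platform/mkldnn/someop.cpp
    PLATFORM_IMPL(someop) {
        auto input  = INPUT_VARIABLE(0);
        auto output = OUTPUT_VARIABLE(0);

        // build mkldnn memory descriptors and primitives for the op here,
        // then execute them on a stream, as the pooling/conv helpers above do

        return Status::OK();
    }

    PLATFORM_CHECK(someop) {
        auto input  = INPUT_VARIABLE(0);
        auto output = OUTPUT_VARIABLE(0);

        // only engage the helper when MKL-DNN is requested and the data types are supported
        return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output});
    }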
diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp new file mode 100644 index 000000000..03f37000e --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp @@ -0,0 +1,70 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include + +#ifdef HAVE_MKLDNN + +#include + +#endif + +class MklDnnTests : public testing::Test { +public: + +}; + +static void printer(std::initializer_list helpers) { + + for (auto v:helpers) { + nd4j_printf("Initialized [%s]\n", v->name().c_str()); + } +} + + +TEST_F(MklDnnTests, helpers_includer) { + // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker +#ifdef HAVE_MKLDNN + nd4j::ops::platforms::PLATFORM_conv2d conv2d; + nd4j::ops::platforms::PLATFORM_conv2d_bp conv2d_bp; + + nd4j::ops::platforms::PLATFORM_conv2d conv3d; + nd4j::ops::platforms::PLATFORM_conv2d_bp conv3d_bp; + + nd4j::ops::platforms::PLATFORM_avgpool2d avgpool2d; + nd4j::ops::platforms::PLATFORM_avgpool2d_bp avgpool2d_bp; + + nd4j::ops::platforms::PLATFORM_maxpool2d maxpool2d; + nd4j::ops::platforms::PLATFORM_maxpool2d_bp maxpool2d_bp; + + nd4j::ops::platforms::PLATFORM_avgpool3dnew avgpool3d; + nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp avgpool3d_bp; + + nd4j::ops::platforms::PLATFORM_maxpool3dnew maxpool3d; + nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp maxpool3d_bp; + + nd4j::ops::platforms::PLATFORM_lrn lrn; + nd4j::ops::platforms::PLATFORM_batchnorm_new batchnorm; + + printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm}); +#endif +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 1ac373676..c55dc884e 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -21,7 +21,7 @@ endif() # OPTIONAL MKL-DNN if ("${BUILD_MKLDNN}") # Download and unpack mkl-dnn at configure time - configure_file(./CMakeLists.txt.in mkldnn-download/CMakeLists.txt) + configure_file(../../CMakeLists.txt.mkldnn.in mkldnn-download/CMakeLists.txt) execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-download ) @@ -40,7 +40,7 @@ if ("${BUILD_MKLDNN}") EXCLUDE_FROM_ALL) set(mkldnn_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/mkldnn-src) set(HAVE_MKLDNN 1) - add_definitions(-DHAVE_MKLDNN=true) + add_definitions("-DHAVE_MKLDNN") include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_SOURCE_DIR}/external/mklml_lnx_2019.0.3.20190220/include ${mkldnn_SOURCE_DIR}) set(MKLDNN mkldnn) endif() @@ -131,7 +131,7 @@ else() endif() if (${F16C}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c -D__F16C__=true") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c -DSD_F16C=true") endif() endif() @@ -177,6 +177,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION} message(FATAL_ERROR "You need at least GCC 4.9") endif() +message("Looking for OpenMP") find_package(OpenMP) if (OPENMP_FOUND) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -185,10 +186,11 @@ else() message("OPENMP NOT FOUND") endif() -if ("${OPENBLAS}" OR CMAKE_BUILD_TYPE STREQUAL "Release") - find_package(BLAS) +if ("${OPENBLAS}" OR CMAKE_BUILD_TYPE STREQUAL "Release" OR "${BUILD_MKLDNN}") + message("Looking for BLAS") + find_package(BLAS REQUIRED) if (BLAS_FOUND) - message("Found external BLAS implementation...") + message("Found external BLAS library: ${BLAS_LIBRARIES}") add_definitions(-D__EXTERNAL_BLAS__=true) endif() endif() @@ -201,13 +203,18 @@ file(GLOB_RECURSE ARRAY_SOURCES false ../../include/array/*.cpp ../../include/ar file(GLOB_RECURSE MEMORY_SOURCES false ../../include/memory/*.cpp ../../include/memory/*.h) file(GLOB_RECURSE GRAPH_SOURCES false ../../include/graph/*.cpp ../../include/graph/*.h) file(GLOB_RECURSE CUSTOMOPS_SOURCES false ../../include/ops/declarable/generic/*.cpp) -file(GLOB_RECURSE CUSTOMOPS_HELPERS_SOURCES false ../../include/ops/declarable/helpers/cpu/*.cpp ../../include/ops/declarable/helpers/impl/*.cpp) +file(GLOB_RECURSE CUSTOMOPS_GENERIC_SOURCES false ../../include/ops/declarable/helpers/cpu/*.cpp ../../include/ops/declarable/helpers/impl/*.cpp) file(GLOB_RECURSE OPS_SOURCES false ../../include/ops/impl/*.cpp ../../include/ops/declarable/impl/*.cpp ../../include/ops/*.h) file(GLOB_RECURSE INDEXING_SOURCES false ../../include/indexing/*.cpp ../../include/indexing/*.h) -file(GLOB_RECURSE HELPERS_SOURCES false ../../include/helpers/*.cpp ../../include/helpers/*.h) +file(GLOB_RECURSE HELPERS_SOURCES false ../../include/helpers/*.cpp) file(GLOB_RECURSE LOOPS_SOURCES false ../../include/loops/*.cpp ../../include/loops/*.h) -message("CPU BLAS") +# optionally build mkldnn +if ("${BUILD_MKLDNN}") + file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../../include/ops/declarable/platform/mkldnn/*.cpp) +endif() + +message("CPU backend") add_definitions(-D__CPUBLAS__=true) if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(MINGW) AND NOT(APPLE)) @@ -216,8 +223,37 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(MINGW) AND NOT(APPLE)) endif() +# this function strips path from file name, basically making up short file name, i.e. 
file.cpp +function(SHORTNAME LONG_NAME OUTPUT) + SET(_TMP_STR "") + string (REGEX REPLACE ".*/" "" _TMP_STR "${LONG_NAME}") + set (${OUTPUT} "${_TMP_STR}" PARENT_SCOPE) +endfunction() + +# now we need to join two lists +# first we'll build a truncated list of files in platform sources +# and a list of priority implementations from platform helpers +#set(CUSTOMOPS_HELPERS_SOURCES "") +#set(SHORT_NAMES "") +#foreach(LONG_NAME ${CUSTOMOPS_PLATFORM_SOURCES}) +# SHORTNAME("${LONG_NAME}" "SHORT_NAME") +# set(CUSTOMOPS_HELPERS_SOURCES ${CUSTOMOPS_HELPERS_SOURCES} ${LONG_NAME}) +# set(SHORT_NAMES ${SHORT_NAMES} ${SHORT_NAME}) +#endforeach() + +# now we're going to filter generic helpers to exclude platform implementations +#foreach(LONG_NAME ${CUSTOMOPS_GENERIC_SOURCES}) +# SHORTNAME("${LONG_NAME}" "SHORT_NAME") + + # and now we add this op ONLY if it wasn't announced in platform helpers +# string(FIND "${SHORT_NAMES}" "${SHORT_NAME}" "LOC") +# if (${LOC} EQUAL -1) +# set(CUSTOMOPS_HELPERS_SOURCES ${CUSTOMOPS_HELPERS_SOURCES} ${LONG_NAME}) +# endif() +#endforeach() + + file(GLOB_RECURSE TEST_SOURCES false ../layers_tests/*.cpp ../layers_tests/*.h) -# file(GLOB_RECURSE TEST_SOURCES false ../layers_tests/DeclarableOpsTests6.cpp ../layers_tests/*.h) # Filter out any source files from */CMakeFiles/* paths. these tend to cause problems such a multiple main definitions. @@ -234,7 +270,7 @@ add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas ../../blas/cpu/NativeOpExecutioner.cpp ../../blas/cpu/NDArray.cpp ../../blas/cpu/NDArrayFactory.cpp ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} - ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} + ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES}) diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index e694587b0..a060232db 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1153,4 +1153,10 @@ public interface NativeOps { String lastErrorMessage(); boolean isBlasVersionMatches(int major, int minor, int build); + + int binaryLevel(); + int optimalLevel(); + + boolean isMinimalRequirementsMet(); + boolean isOptimalRequirementsMet(); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java index 8f95fe5cb..67e8b4838 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java @@ -68,6 +68,7 @@ import org.bytedeco.javacpp.tools.InfoMapper; //"op_boilerplate.h", "ops/InputType.h", "ops/declarable/OpDescriptor.h", +
"ops/declarable/PlatformHelper.h", "ops/declarable/BroadcastableOp.h", "helpers/OpArgsHolder.h", "ops/declarable/DeclarableOp.h", diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml index 3b008222d..c6017e3a7 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml @@ -68,17 +68,6 @@ ${mkl.version}-${javacpp-presets.version} ${dependency.platform2} - - org.bytedeco - mkl-dnn - ${mkl-dnn.javacpp.version} - - - org.bytedeco - mkl-dnn - ${mkl-dnn.javacpp.version} - ${dependency.platform2} - org.nd4j libnd4j @@ -204,17 +193,6 @@ ${openblas.version}-${javacpp-presets.version} ${dependency.platform} - - org.bytedeco - mkl-dnn - ${mkl-dnn.javacpp.version} - - - org.bytedeco - mkl-dnn - ${mkl-dnn.javacpp.version} - ${dependency.platform2} - ${javacpp.platform.properties} @@ -257,19 +235,15 @@ ${libnd4jhome}/blasbuild/cpu/blas /${javacpp.platform.library.path}/ - /org/bytedeco/mkldnn/${javacpp.platform}/ /org/bytedeco/openblas/${javacpp.platform}/ /${javacpp.platform.library.path}/include/ - /org/bytedeco/mkldnn/${javacpp.platform}/include/ /org/bytedeco/openblas/${javacpp.platform}/include/ /${javacpp.platform.library.path}/ /${javacpp.platform.library.path}/lib/ - /org/bytedeco/mkldnn/${javacpp.platform}/ - /org/bytedeco/mkldnn/${javacpp.platform}/ /org/bytedeco/openblas/${javacpp.platform}/ /org/bytedeco/openblas/${javacpp.platform}/lib/ diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java index 7cd4101ef..ff4ac6bcc 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java @@ -20,6 +20,7 @@ package org.nd4j.linalg.cpu.nativecpu; import lombok.extern.slf4j.Slf4j; import lombok.val; import org.nd4j.base.Preconditions; +import org.nd4j.config.ND4JEnvironmentVars; import org.nd4j.config.ND4JSystemProperties; import org.nd4j.linalg.api.buffer.*; import org.nd4j.linalg.api.ops.custom.Flatten; @@ -90,6 +91,38 @@ public class CpuNDArrayFactory extends BaseNativeNDArrayFactory { System.setProperty(ND4JSystemProperties.ND4J_CPU_LOAD_OPENBLAS_NOLAPACK, "mklml"); } + // we'll check hardware support first + if (!nativeOps.isMinimalRequirementsMet()) { + // this means cpu binary was built for some arch support, we don't have on this box + + val binaryLevel = nativeOps.binaryLevel(); + val optimalLevel = nativeOps.optimalLevel(); + + String binLevel = cpuBinaryLevelToName(binaryLevel); + String optLevel = cpuBinaryLevelToName(optimalLevel); + + log.warn("*********************************** CPU Feature Check Failed ***********************************"); + log.error("Error initializing ND4J: Attempting to use " + binLevel + " ND4J binary on a CPU with only " + optLevel + " support"); + log.error( binLevel + " binaries cannot be run on a CPU without these instructions. 
See deeplearning4j.org/cpu for more details"); + log.error("ND4J will now exit."); + log.warn("************************************************************************************************"); + System.exit(1); + } + + if (!nativeOps.isOptimalRequirementsMet() && !Boolean.parseBoolean(System.getenv(ND4JEnvironmentVars.ND4J_IGNORE_AVX))) { + val binaryLevel = nativeOps.binaryLevel(); + val optimalLevel = nativeOps.optimalLevel(); + + String binLevel = cpuBinaryLevelToName(binaryLevel); + String optLevel = cpuBinaryLevelToName(optimalLevel); + + log.warn("*********************************** CPU Feature Check Warning ***********************************"); + log.warn("Warning: Initializing ND4J with " + binLevel + " binary on a CPU with " + optLevel + " support"); + log.warn("Using ND4J with " + optLevel + " will improve performance. See deeplearning4j.org/cpu for more details"); + log.warn("Or set environment variable " + ND4JEnvironmentVars.ND4J_IGNORE_AVX + "=true to suppress this warning"); + log.warn("************************************************************************************************"); + } + blas = new CpuBlas(); // TODO: add batched gemm here @@ -111,6 +144,19 @@ public class CpuNDArrayFactory extends BaseNativeNDArrayFactory { throw new RuntimeException(nativeOps.lastErrorMessage()); } + private static String cpuBinaryLevelToName(int level){ + switch (level){ + case 3: + return "AVX512"; + case 2: + return "AVX/AVX2"; + case 1: + case 0: + default: + return "Generic x86"; + } + } + @Override public void createLevel1() { level1 = new CpuLevel1(); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index dabac7001..8d92e09ad 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -3120,6 +3120,13 @@ public native @Cast("Nd4jPointer") Pointer lcCopyStream(OpaqueLaunchContext lc); public native @Cast("Nd4jPointer") Pointer lcBlasHandle(OpaqueLaunchContext lc); public native @Cast("Nd4jPointer") Pointer lcSolverHandle(OpaqueLaunchContext lc); + +public native int binaryLevel(); +public native int optimalLevel(); + +public native @Cast("bool") boolean isMinimalRequirementsMet(); +public native @Cast("bool") boolean isOptimalRequirementsMet(); + // #endif //NATIVEOPERATIONS_NATIVEOPS_H @@ -6578,9 +6585,6 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { // #include // #include -// #ifdef HAVE_MKLDNN -// #endif - // CUDA-specific includes // #ifdef __CUDACC__ // #endif @@ -6647,8 +6651,6 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { public native int getBranch(); public native void setBranch(int branch); -// #ifdef HAVE_MKLDNN -// #endif /** * * @return @@ -11607,6 +11609,81 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif //LIBND4J_OPDESCRIPTOR_H +// Parsed from ops/declarable/PlatformHelper.h + +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +// #ifndef SD_PLATFORMHELPER_H +// #define SD_PLATFORMHELPER_H + +// #include +// #include +// #include +// #include +// #include + /** + * This abstract class defines the methods used by platform-specific helper implementations + */ + @Namespace("nd4j::ops::platforms") @NoOffset public static class PlatformHelper extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public PlatformHelper(Pointer p) { super(p); } + + + public native @StdString BytePointer name(); + + public native @Cast("Nd4jLong") long hash(); + + /** + * This method checks whether the given helper can be used with the given input/output/configuration options + * + * @param context + * @return + */ + public native @Cast("bool") boolean isUsable(@ByRef Context context); + + /** + * This method invokes the helper. Typically this method replaces the actual op execution + * + * @param context + * @return + */ + public native @Cast("Nd4jStatus") int invokeHelper(@ByRef Context context); + + /** + * Helper method, needed for compatibility with DeclarableOp macros + * @param ctx + * @param inputId + * @return + */ + public native NDArray getZ(@ByRef Context ctx, int inputId); + } + + + + + +// #endif //SD_PLATFORMHELPER_H + + // Parsed from ops/declarable/BroadcastableOp.h /******************************************************************************* @@ -12080,6 +12157,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include // #include +// #include // handlers part // #include @@ -12107,7 +12185,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("char*") String getAllCustomOperations(); /** - * This method registers operation + * This method registers an operation in our registry, so we can use it later * * @param op */ @@ -12115,10 +12193,16 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("bool") boolean registerOperation(@Cast("char*") BytePointer name, DeclarableOp op); public native @Cast("bool") boolean registerOperation(DeclarableOp op); + public native void registerHelper(PlatformHelper op); + + public native @Cast("bool") boolean hasHelper(@Cast("Nd4jLong") long hash); + public native DeclarableOp getOperation(@Cast("char*") String name); public native DeclarableOp getOperation(@Cast("char*") BytePointer name); public native DeclarableOp getOperation(@Cast("Nd4jLong") long hash); + public native PlatformHelper getPlatformHelper(@Cast("Nd4jLong") long hash); + public native @Cast("Nd4jLong*") @StdVector LongPointer getAllHashes(); public native int numberOfOperations(); @@ -15405,7 +15489,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif /** - * This operation unstacks given NDArray into NDArrayList + * This operation unstacks the given NDArray into an NDArrayList along the first dimension */ // #if NOT_EXCLUDED(OP_unstack_list) @Namespace("nd4j::ops") public static class unstack_list extends DeclarableListOp { @@ -18168,9 +18252,21 @@ public static final int TAD_THRESHOLD =
TAD_THRESHOLD(); /** + * This operation rearranges data from depth into blocks of spatial data. This is the reverse transformation + * of the space_to_depth op. The output of this op is a copy of the input tensor where values from the depth dimension + * are moved in spatial blocks to the height and width dimensions. Int attr 0 indicates the input + * block size and how the data is moved. + * Input: + * 0 - 4D tensor of given type + * Output: + * 0 - 4D tensor of given type and proper shape * - * - * + * Int arguments: + * 0 - block size + * 1 - output data format: 0 ("NHWC"): shape{ batch, height, width, channels } + * 1 ("NCHW"): shape{ batch, channels, height, width } + * 2 ("NCHW_VECT_C"): int8 shape{ batch, channels / 4, height, width, 4 } + * optional (default 0) */ // #if NOT_EXCLUDED(OP_depth_to_space) @Namespace("nd4j::ops") public static class depth_to_space extends DeclarableCustomOp { @@ -18191,8 +18287,21 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif /** + * This operation rearranges blocks of spatial data into depth. The output of this op is a copy of the input tensor + * where values from the height and width dimensions are moved to the depth dimension. Int attr 0 indicates + * the input block size. * + * Input: + * - 4D tensor of given type + * Output: + * - 4D tensor * + * Int arguments: + * 0 - block size + * 1 - output data format: 0 ("NHWC"): shape{ batch, height, width, channels } + * 1 ("NCHW"): shape{ batch, channels, height, width } + * 2 ("NCHW_VECT_C"): int8 shape{ batch, channels / 4, height, width, 4 } + * optional (default 0) * */ // #if NOT_EXCLUDED(OP_space_to_depth) @@ -18238,7 +18347,20 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif /** + * Zero-pads and then rearranges (permutes) blocks of spatial data into batch. More specifically, this op + * outputs a copy of the input tensor where values from the height and width dimensions are moved to the + * batch dimension. After the zero-padding, both height and width of the input must be divisible by the block + * size. * + * Inputs: + * 0 - input tensor + * 1 - 2D paddings tensor (shape {M, 2}) + * + * Output: + * - result tensor + * + * Int args: + * 0 - block size (M) * */ // #if NOT_EXCLUDED(OP_space_to_batch) @@ -18259,6 +18381,22 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif + /* + * This operation divides "spatial" dimensions [1, ..., M] of the input into a grid of blocks of shape + * block_shape, and interleaves these blocks with the "batch" dimension (0) such that in the output, + * the spatial dimensions [1, ..., M] correspond to the position within the grid, and the batch dimension + * combines both the position within a spatial block and the original batch position. Prior to division into + * blocks, the spatial dimensions of the input are optionally zero-padded according to paddings. + * + * Inputs: + * 0 - input (N-D tensor) + * 1 - block_shape - int 1D tensor of length M + * 2 - paddings - int 2D tensor with shape {M, 2} + * + * Output: + * - N-D tensor with the same type as input 0. + * + * */ // #if NOT_EXCLUDED(OP_space_to_batch_nd) @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp { static { Loader.load(); } @@ -19009,7 +19147,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * return value: * tensor with min values according to indices sets.
*/ -// #if NOT_EXCLUDED(OP_segment_min_bp) +// #if NOT_EXCLUDED(OP_segment_min) @Namespace("nd4j::ops") public static class segment_min extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -23075,6 +23213,10 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifdef __CUDABLAS__ // #endif +// used for MKLDNN etc +// #if !defined(__STANDALONE_BUILD__) +// #include "config.h" +// #endif // #include // #include @@ -23106,6 +23248,8 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native Workspace getWorkspace(); public native void setWorkspace(Workspace theWorkspace); + public native Pointer engine(); + public native int getDeviceID(); public native void setDeviceID(int deviceID); public native ErrorReference errorReference(); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java index 554016686..4a99bbe3e 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java @@ -76,6 +76,7 @@ import java.util.Scanner; //"op_enums.h", "ops/InputType.h", "ops/declarable/OpDescriptor.h", + "ops/declarable/PlatformHelper.h", "ops/declarable/BroadcastableOp.h", "ops/declarable/DeclarableOp.h", "ops/declarable/DeclarableListOp.h", @@ -127,9 +128,9 @@ import java.util.Scanner; "cnpy/cnpy.h" }, compiler = {"cpp11", "nowarnings"}, library = "jnind4jcpu", link = "nd4jcpu", - preloadresource = {"org/bytedeco/mkldnn/", "org/bytedeco/openblas/"}, + preloadresource = {"org/bytedeco/openblas/"}, preload = {"openblas", "openblas_nolapack", "libnd4jcpu"}), - @Platform(value = "linux", preload = {"gomp@.1", "iomp5", "mklml_intel", "mkldnn@.0"}, + @Platform(value = "linux", preload = {"gomp@.1"}, preloadpath = {"/lib64/", "/lib/", "/usr/lib64/", "/usr/lib/"}), @Platform(value = {"linux-arm", "linux-ppc"}, preload = {"gomp@.1", "gcc_s@.1", "quadmath@.0", "gfortran@.5", "gfortran@.4", "gfortran@.3", "openblas@.0", "libnd4jcpu"}), @@ -137,10 +138,10 @@ import java.util.Scanner; @Platform(value = "linux-arm64", preloadpath = {"/usr/aarch64-linux-gnu/lib/", "/usr/lib/aarch64-linux-gnu/"}), @Platform(value = "linux-ppc64", preloadpath = {"/usr/powerpc64-linux-gnu/lib/", "/usr/powerpc64le-linux-gnu/lib/", "/usr/lib/powerpc64-linux-gnu/", "/usr/lib/powerpc64le-linux-gnu/"}), - @Platform(value = "macosx", preload = {"gcc_s@.1", "gomp@.1", "stdc++@.6", "iomp5", "mklml", "mkldnn@.0"}, + @Platform(value = "macosx", preload = {"gcc_s@.1", "gomp@.1", "stdc++@.6"}, preloadpath = {"/usr/local/lib/gcc/8/", "/usr/local/lib/gcc/7/", "/usr/local/lib/gcc/6/", "/usr/local/lib/gcc/5/"}), @Platform(value = "windows", preload = {"libwinpthread-1", "libgcc_s_seh-1", "libgomp-1", "libstdc++-6", - "msvcr120", "libiomp5md", "mklml", "libmkldnn", "libnd4jcpu"}), + "msvcr120", "libnd4jcpu"}), @Platform(extension = {"-avx512", "-avx2"}) }) public class Nd4jCpuPresets implements InfoMapper, BuildEnabled { diff --git a/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java b/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java index fca2bdcc7..3bcff03f0 100644 --- a/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java +++ 
b/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java @@ -131,6 +131,12 @@ public class ND4JEnvironmentVars { public static final String ND4J_RESOURCES_CACHE_DIR = "ND4J_RESOURCES_CACHE_DIR"; + /** + * Applicability: nd4j-native<br>
+ * Description: Set to true to avoid logging AVX warnings (for example, when running generic x86 binaries on an AVX2-capable system) + */ + public static final String ND4J_IGNORE_AVX = "ND4J_IGNORE_AVX"; + private ND4JEnvironmentVars() { } }
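
Usage sketch (illustrative, not part of this patch): the new NativeOps methods shown above (binaryLevel(), optimalLevel(), isMinimalRequirementsMet(), isOptimalRequirementsMet()) are what CpuNDArrayFactory calls during backend initialization, with levels mapped as in cpuBinaryLevelToName (0/1 = generic x86, 2 = AVX/AVX2, 3 = AVX512). A minimal Java example of running the same check from application code follows; it assumes the usual NativeOpsHolder accessor for obtaining the NativeOps instance, and the class name is hypothetical.

    import org.nd4j.nativeblas.NativeOps;
    import org.nd4j.nativeblas.NativeOpsHolder;

    public class CpuFeatureCheckExample {
        public static void main(String[] args) {
            // obtain the backend's NativeOps instance (assumes the standard NativeOpsHolder accessor)
            NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();

            int binaryLevel = nativeOps.binaryLevel();   // instruction set level the binary was built for
            int optimalLevel = nativeOps.optimalLevel(); // best instruction set level this CPU supports

            if (!nativeOps.isMinimalRequirementsMet()) {
                // the binary needs instructions this CPU lacks; CpuNDArrayFactory exits in this case
                System.err.println("Binary level " + binaryLevel + " exceeds CPU level " + optimalLevel);
            } else if (!nativeOps.isOptimalRequirementsMet()) {
                // the CPU supports more than the binary uses; a faster classifier (e.g. -avx2) is available
                System.err.println("CPU level " + optimalLevel + " exceeds binary level " + binaryLevel);
            }
        }
    }

As in CpuNDArrayFactory itself, the warning case can be suppressed by setting the ND4J_IGNORE_AVX environment variable to true.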
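
Dispatch sketch (illustrative, not part of this patch): the PlatformHelper and OpRegistrator bindings parsed above (registerHelper, hasHelper, getPlatformHelper, plus PlatformHelper.isUsable/invokeHelper) imply the rule the native side follows: operations are keyed by hash, and a registered platform helper is preferred over the generic implementation whenever its isUsable(context) check passes. The Java sketch below is written against the generated Nd4jCpu nested classes; the runWithHelperIfPossible method and the way the OpRegistrator instance is obtained are assumptions for illustration only.

    import org.nd4j.nativeblas.Nd4jCpu;

    public class PlatformHelperDispatchExample {
        // sketch of the dispatch implied by the new registry methods; not part of this patch
        static void runWithHelperIfPossible(Nd4jCpu.OpRegistrator registrator, long opHash, Nd4jCpu.Context ctx) {
            if (registrator.hasHelper(opHash) && registrator.getPlatformHelper(opHash).isUsable(ctx)) {
                // a platform (e.g. MKL-DNN) implementation is registered and applicable to this context
                registrator.getPlatformHelper(opHash).invokeHelper(ctx);
            } else {
                // otherwise the generic implementation registered under the same hash is used
                Nd4jCpu.DeclarableOp genericOp = registrator.getOperation(opHash);
                System.out.println("Falling back to generic implementation for op hash " + opHash + ": " + genericOp);
            }
        }
    }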