cuDNN integration (#150)

* initial commit

Signed-off-by: raver119 <raver119@gmail.com>

* one file

Signed-off-by: raver119 <raver119@gmail.com>

* few more includes

Signed-off-by: raver119 <raver119@gmail.com>

* m?

Signed-off-by: raver119 <raver119@gmail.com>

* const

Signed-off-by: raver119 <raver119@gmail.com>

* cudnn linkage in tests

Signed-off-by: raver119 <raver119@gmail.com>

* culibos

Signed-off-by: raver119 <raver119@gmail.com>

* static reminder

Signed-off-by: raver119 <raver119@gmail.com>

* platform engine tag

Signed-off-by: raver119 <raver119@gmail.com>

* HAVE_CUDNN moved to config.h.in

Signed-off-by: raver119 <raver119@gmail.com>

* include

Signed-off-by: raver119 <raver119@gmail.com>

* include

Signed-off-by: raver119 <raver119@gmail.com>

* skip cudnn handle creation if there's not cudnn

Signed-off-by: raver119 <raver119@gmail.com>

* meh

Signed-off-by: raver119 <raver119@gmail.com>

* target device in context

Signed-off-by: raver119 <raver119@gmail.com>

* platform engines

Signed-off-by: raver119 <raver119@gmail.com>

* platform engines

Signed-off-by: raver119 <raver119@gmail.com>

* allow multiple -h args

Signed-off-by: raver119 <raver119@gmail.com>

* allow multiple -h args

Signed-off-by: raver119 <raver119@gmail.com>

* move mkldnn out of CPU block

Signed-off-by: raver119 <raver119@gmail.com>

* link to mkldnn on cuda

Signed-off-by: raver119 <raver119@gmail.com>

* less prints

Signed-off-by: raver119 <raver119@gmail.com>

* minor tweaks

Signed-off-by: raver119 <raver119@gmail.com>

* next step

Signed-off-by: raver119 <raver119@gmail.com>

* conv2d NCHW draft

Signed-off-by: raver119 <raver119@gmail.com>

* conv2d biasAdd

Signed-off-by: raver119 <raver119@gmail.com>

* test for MKL/CUDNN combined use

Signed-off-by: raver119 <raver119@gmail.com>

* - provide additional code for conv2d ff based on cudnn api, not tested yet

Signed-off-by: Yurii <iuriish@yahoo.com>

* - further work on conv2d helper based on using cudnn api

Signed-off-by: Yurii <iuriish@yahoo.com>

* - fixing several cuda bugs which appeared after cudnn lib had been started to use

Signed-off-by: Yurii <iuriish@yahoo.com>

* - implementation of conv2d backprop op based on cudnn api

Signed-off-by: Yurii <iuriish@yahoo.com>

* - implementaion of conv3d and conv3d_bp ops based on cudnn api

Signed-off-by: Yurii <iuriish@yahoo.com>

* - bugs fixing in conv3d/conv3d_bp ops (cudnn in use)

Signed-off-by: Yurii <iuriish@yahoo.com>

* - implementation of depthwiseConv2d (ff/bp) op based on cudnn api

Signed-off-by: Yurii <iuriish@yahoo.com>

* - implementation of batchnorm ff op based on cudnn api

Signed-off-by: Yurii <iuriish@yahoo.com>

* - disable cudnn batchnorm temporary

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add minor change in cmake

Signed-off-by: Yurii <iuriish@yahoo.com>

* engine for depthwise mkldnn

Signed-off-by: raver119 <raver119@gmail.com>

* couple of includes

Signed-off-by: raver119 <raver119@gmail.com>

* - provide permutation to cudnn batchnorm ff when format is NHWC

Signed-off-by: Yurii <iuriish@yahoo.com>

* lgamma fix

Signed-off-by: raver119 <raver119@gmail.com>

* - eliminate memory leak in two tests

Signed-off-by: Yurii <iuriish@yahoo.com>

Co-authored-by: Yurii Shyrma <iuriish@yahoo.com>
master
raver119 2020-01-20 21:32:46 +03:00 committed by GitHub
parent 8fc0e63ce7
commit 7783012f39
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
65 changed files with 2810 additions and 435 deletions

View File

@ -14,6 +14,10 @@ set(CMAKE_CXX_STANDARD 11)
if (CUDA_BLAS)
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 11)
set(DEFAULT_ENGINE "samediff::ENGINE_CUDA")
else()
set(DEFAULT_ENGINE "samediff::ENGINE_CPU")
endif()
# MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively
@ -60,6 +64,7 @@ if(NATIVE)
ENDIF()
endif()
if(NOT CUDA_BLAS)
# we need this definition to avoid global memory use within mkldnn
add_definitions(-DDNNL_ENABLE_CONCURRENT_EXEC=true)
@ -128,9 +133,11 @@ if(NOT CUDA_BLAS)
include_directories(${CPUF_SOURCE_DIR}/include)
set(CPU_FEATURES cpu_features)
endif()
endif()
# new mkl-dnn entry
if (${HELPERS_mkldnn})
# new mkl-dnn entry
if (${HELPERS_mkldnn})
message("Going to pull & build mkldnn")
set(HAVE_MKLDNN 1)
set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "Hack to enforce static mode" FORCE)
@ -158,6 +165,38 @@ if(NOT CUDA_BLAS)
set(MKLDNN_PATH "${mkldnn_SOURCE_DIR}")
include_directories(${mkldnn_SOURCE_DIR}/include ${mkldnn_EXT_DIR}/include ${mkldnn_SOURCE_DIR})
set(MKLDNN dnnl)
endif()
if (${HELPERS_cudnn})
if (NOT CUDA_BLAS)
message(FATAL_ERROR "Can't build cuDNN on non-CUDA platform")
endif()
set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN")
# FIXME: we don't want static library in master
SET(CUDNN_LIBNAME "cudnn")
SET(CULIBOS_LIBNAME "culibos")
find_path(CUDNN_INCLUDE_DIR cudnn.h
HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES cuda/include include)
find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_library(CULIBOS_LIBRARY ${CULIBOS_LIBNAME}
HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
if (CUDNN_LIBRARY)
set(HAVE_CUDNN true)
set(CUDNN ${CUDNN_LIBRARY} ${CULIBOS_LIBRARY})
else()
message(FATAL_ERROR "Unable to find cuDNN")
endif()
endif()
@ -185,6 +224,8 @@ set(HAVE_FLATBUFFERS 1)
set(FLATBUFFERS_PATH ${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src)
include_directories(${FLATBUFFERS_PATH}/include)
configure_file(include/config.h.in include/config.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)

View File

@ -131,6 +131,11 @@ if(!CUDA_BLAS)
endif()
endif()
#if MKLDNN is enabled - we're building mkldnn-powered helpers
if (HAVE_MKLDNN)
file(GLOB_RECURSE CUSTOMOPS_MKLDNN_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h)
endif()
if(CUDA_BLAS)
message("Build cublas")
find_package(CUDA)
@ -210,16 +215,23 @@ if(CUDA_BLAS)
file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h)
file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu)
if (HAVE_CUDNN)
message("cuDNN included")
file(GLOB_RECURSE CUSTOMOPS_CUDNN_SOURCES false ../include/ops/declarable/platform/cudnn/*.cu)
endif()
add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES})
add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
if (WIN32)
message("MSVC runtime for library: ${MSVC_RT_LIB}")
endif()
# static library is built only if we're going to build tests, skip otherwise
if (BUILD_TESTS)
@ -237,7 +249,7 @@ if(CUDA_BLAS)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
endif()
target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY})
target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda)
install(TARGETS ${LIBND4J_NAME} DESTINATION .)
@ -264,12 +276,6 @@ elseif(CPU_BLAS)
file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/*.cpp ../include/helpers/*.h)
file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/*.cpp ../include/loops/*.h)
#if MKLDNN is enabled - we're building mkldnn-powered helpers
if (HAVE_MKLDNN)
file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h)
endif()
if (X86_BUILD)
# we disable platform optimizations for certains files for linux/macos
set_source_files_properties(cpu/NativeOps.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic")
@ -282,7 +288,7 @@ elseif(CPU_BLAS)
cpu/NativeOpExecutioner.cpp cpu/NDArray.cpp cpu/NDArrayFactory.cpp
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
${OPS_SOURCES} ${PERF_SOURCES})
if(IOS)
add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>)

View File

@ -940,6 +940,7 @@ namespace nd4j {
template <typename T>
std::vector<T> getBufferAsVector();
std::vector<Nd4jLong> getShapeAsVector() const;
std::vector<int> getShapeAsVectorInt() const;
std::vector<Nd4jLong> getShapeInfoAsVector();
std::vector<int64_t> getShapeInfoAsFlatVector();
std::vector<int64_t> getShapeAsFlatVector();

View File

@ -444,6 +444,16 @@ std::vector<Nd4jLong> NDArray::getShapeAsVector() const {
return vector;
}
////////////////////////////////////////////////////////////////////////
std::vector<int> NDArray::getShapeAsVectorInt() const {
std::vector<int> vector(this->rankOf());
for (int e = 0; e < this->rankOf(); e++)
vector[e] = static_cast<int>(this->sizeAt(e));
return vector;
}
////////////////////////////////////////////////////////////////////////
std::vector<int64_t> NDArray::getShapeInfoAsFlatVector() {
int magicNumber = shape::shapeInfoLength(this->rankOf());
@ -625,7 +635,7 @@ void NDArray::assign(const NDArray& other, bool allowParallelism) {
if (other.lengthOf() != lengthOf()) {
auto shapeThis = ShapeUtils::shapeAsString(this);
auto shapeThat = ShapeUtils::shapeAsString(&other);
nd4j_printf("Can't assign new value to the array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str());
nd4j_printf("Can't assign array: this shape %s; other shape: %s\n", shapeThis.c_str(), shapeThat.c_str());
throw std::runtime_error("NDArray::assign: lengths of arrays are mismatched");
}

View File

@ -488,7 +488,7 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc,
throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", xType, zType);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 8192);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 8192);
BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES);
@ -523,7 +523,7 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc,
auto xRank = shape::rank(hXShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES);
@ -559,7 +559,7 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc,
auto xRank = shape::rank(hXShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES);
@ -601,7 +601,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc,
auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo);
auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
if (zType != nd4j::DataType::INT64 && zType != nd4j::DataType::INT32)
throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType);
@ -647,7 +647,7 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc,
auto xRank = shape::rank(hXShapeInfo);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, stream, opNum, xRank, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES);
@ -684,7 +684,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
if (nd4j::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1)
printf("AF1 opNum:[%i]\n", opNum);
@ -734,7 +734,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, stream, opNum, dX,dXShapeInfo, hXShapeInfo, extraParams, dZ,dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);
@ -766,7 +766,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, BOOL_TYPES);
@ -797,7 +797,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES);
@ -828,7 +828,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionPointer, nullptr), LIBND4J_TYPES, LONG_TYPES);
@ -1085,7 +1085,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc,
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
if (xType != yType)
throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType);
@ -1135,7 +1135,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc,
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum,
dX, dXShapeInfo,
@ -1177,7 +1177,7 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc,
auto xLength = shape::length(hXShapeInfo);
auto blockWidth = 256;
auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth);
dim3 launchDims(numBlocks, blockWidth, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768);
if (xType != yType)
throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType);
@ -1595,7 +1595,7 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc,
throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType);
auto numBlocks = shape::length(hZShapeInfo);
dim3 launchDims(numBlocks, 256, 32768);
dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);
BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

View File

@ -489,6 +489,7 @@ mkbuilddir() {
cd "blasbuild/$CHIP"
}
HELPERS=""
if [ "$HELPER" == "" ]; then
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
echo "!! !!"
@ -503,6 +504,14 @@ if [ "$HELPER" == "" ]; then
echo "!! !!"
echo "!! !!"
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
else
# if helpers were defined, we'll propagate them to CMake
IFS=','
read -ra HLP <<< "$HELPER"
for i in "${HLP[@]}"; do
HELPERS="${HELPERS} -DHELPERS_$i=true"
done
IFS=' '
fi
echo PACKAGING = "${PACKAGING}"
@ -519,10 +528,10 @@ echo MINIFIER = "${MINIFIER_ARG}"
echo TESTS = "${TESTS_ARG}"
echo NAME = "${NAME_ARG}"
echo OPENBLAS_PATH = "$OPENBLAS_PATH"
echo HELPERS = "$HELPER"
echo HELPERS = "$HELPERS"
mkbuilddir
pwd
eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DHELPERS_"$HELPER"=true "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
if [ "$PARALLEL" == "true" ]; then
MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ"
fi

View File

@ -13,4 +13,8 @@
#cmakedefine FLATBUFFERS_PATH "@FLATBUFFERS_PATH@"
#cmakedefine HAVE_CUDNN
#cmakedefine DEFAULT_ENGINE @DEFAULT_ENGINE@
#endif

View File

@ -0,0 +1,31 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef SD_ENGINE_H
#define SD_ENGINE_H
namespace samediff {
enum Engine {
ENGINE_CPU = 0,
ENGINE_CUDA = 1,
};
}
#endif //SD_ENGINE_H

View File

@ -18,8 +18,8 @@
// @author raver119@gmail.com
//
#ifndef DEV_TESTS_EXECUTOR_H
#define DEV_TESTS_EXECUTOR_H
#ifndef SD_EXECUTOR_H
#define SD_EXECUTOR_H
namespace nd4j {
class Executor {
@ -30,4 +30,4 @@ namespace nd4j {
};
}
#endif //DEV_TESTS_EXECUTOR_H
#endif //SD_EXECUTOR_H

View File

@ -27,6 +27,7 @@
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include "config.h"
#endif
// used for MKLDNN etc
@ -81,6 +82,7 @@ class ND4J_EXPORT LaunchContext {
int* getAllocationPointer() const;
void* getCublasHandle() const;
void* getCusolverHandle() const;
void* getCuDnnHandle() const;
cudaStream_t* getCudaStream() const;
cudaStream_t* getCudaSpecialStream() const;

View File

@ -166,6 +166,10 @@ LaunchContext::LaunchContext() {
return contextBuffers.isInitialized();
}
void* LaunchContext::getCuDnnHandle() const {
return CublasHelper::getInstance()->cudnn();
}
sd::ErrorReference* LaunchContext::errorReference() {
return contextBuffers.errorReference();
}

View File

@ -27,6 +27,7 @@
#include <graph/VariableSpace.h>
#include <graph/ContextPrototype.h>
#include <memory/Workspace.h>
#include <execution/Engine.h>
// CUDA-specific includes
#ifdef __CUDACC__
@ -102,12 +103,13 @@ namespace nd4j {
// this method returns workspace for object allocations
nd4j::memory::Workspace* oWorkspace();
void setVariableSpace(VariableSpace* variableSpace);
nd4j::random::RandomBuffer* getRNG();
void setRNG(nd4j::random::RandomBuffer* rng);
void setTargetEngine(samediff::Engine engine);
VariableSpace *getVariableSpace();
LaunchContext* launchContext();

View File

@ -27,6 +27,11 @@
#include <dll.h>
#include <RandomGenerator.h>
#include <ops/declarable/OpDescriptor.h>
#include <execution/Engine.h>
#ifndef __STANDALONE_BUILD__
#include <config.h>
#endif
namespace nd4j {
namespace graph {
@ -53,6 +58,8 @@ namespace nd4j {
nd4j::ops::OpDescriptor* _opDescriptor;
bool _useMKLDNN = nd4j::Environment::getInstance()->isUseMKLDNN();
// target engine for execution
samediff::Engine _engine = DEFAULT_ENGINE;
public:
explicit ContextPrototype(nd4j::ops::OpDescriptor* opDescriptor = nullptr, int nodeId = 1, bool inPlace = false);
~ContextPrototype() = default;
@ -84,6 +91,8 @@ namespace nd4j {
std::vector<bool>* getBArguments();
std::vector<int>* getAxis();
samediff::Engine engine();
size_t numT();
size_t numI();
size_t numB();

View File

@ -107,6 +107,10 @@ namespace nd4j {
delete _context;
}
void Context::setTargetEngine(samediff::Engine engine) {
_engine = engine;
}
bool Context::hasWorkspaceProvided() {
return this->_workspace != nullptr;
}

View File

@ -59,6 +59,10 @@ namespace nd4j {
}
}
samediff::Engine ContextPrototype::engine() {
return _engine;
}
bool ContextPrototype::hasVariablesFilled() {
return this->_inputs.size() > 0;
}

View File

@ -34,12 +34,14 @@ namespace nd4j {
std::vector<void*> _cache;
std::vector<void*> _solvers;
std::vector<void*> _cudnn;
CublasHelper();
~CublasHelper();
public:
static CublasHelper* getInstance();
void* cudnn();
void* solver();
void* handle();

View File

@ -25,6 +25,13 @@
#include <exceptions/cuda_exception.h>
#include <helpers/logger.h>
#include <execution/AffinityManager.h>
#include "config.h"
#ifdef HAVE_CUDNN
#include <cudnn.h>
#endif
namespace nd4j {
std::mutex CublasHelper::_mutex;
@ -47,6 +54,18 @@ namespace nd4j {
return cusolverH;
}
static void* cudnn_() {
#ifdef HAVE_CUDNN
auto cudnnH = new cudnnHandle_t();
auto status = cudnnCreate(cudnnH);
if (status != CUDNN_STATUS_SUCCESS)
throw cuda_exception::build("cuDNN handle creation failed !", status);
return cudnnH;
#endif
return nullptr;
}
static void destroyHandle_(void* handle) {
auto ch = reinterpret_cast<cublasHandle_t *>(handle);
auto status = cublasDestroy_v2(*ch);
@ -62,11 +81,13 @@ namespace nd4j {
auto currentDevice = AffinityManager::currentDeviceId();
_cache.resize(numDevices);
_solvers.resize(numDevices);
_cudnn.resize(numDevices);
for (int e = 0; e < numDevices; e++) {
AffinityManager::setCurrentNativeDevice(e);
_cache[e] = handle_();
_solvers[e] = solver_();
_cudnn[e] = cudnn_();
}
// don't forget to restore back original device
@ -90,6 +111,14 @@ namespace nd4j {
return _INSTANCE;
}
void* CublasHelper::cudnn() {
auto deviceId = AffinityManager::currentDeviceId();
if (deviceId < 0 || deviceId > _cudnn.size())
throw cuda_exception::build("requested deviceId doesn't look valid", deviceId);
return _cudnn[deviceId];
}
void* CublasHelper::handle() {
auto deviceId = AffinityManager::currentDeviceId();
return handle(deviceId);

View File

@ -35,12 +35,12 @@ static __global__ void simpleIndexReduceGeneric(const int op,
Nd4jLong *xShapeInfo, int xRank,
void *extraParams,
void *result,
Nd4jLong *resultShapeInfo, int zRank,
Nd4jLong *zShapeInfo, int zRank,
int *dimension,
int dimensionLength,
int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {
functions::indexreduce::IndexReduce<X, Z>::transform(op,dx,xShapeInfo,extraParams,result,resultShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets);
functions::indexreduce::IndexReduce<X, Z>::transform(op,dx,xShapeInfo,extraParams,result,zShapeInfo,dimension,dimensionLength,postProcessOrNot,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets);
}
namespace functions {
@ -52,7 +52,7 @@ namespace functions {
void *dx, Nd4jLong *xShapeInfo,
int xRank,
void *extraParams,
void *result, Nd4jLong *resultShapeInfo,
void *result, Nd4jLong *zShapeInfo,
int zRank,
int *dimension, int dimensionLength,
int postProcessOrNot,
@ -62,7 +62,7 @@ namespace functions {
simpleIndexReduceGeneric<X, Z><<<launchDims.x,launchDims.y,launchDims.z, *stream>>>(opNum,
dx, xShapeInfo, xRank,
extraParams,
result, resultShapeInfo, 0,
result, zShapeInfo, 0,
nullptr, 0,
1,
allocationBuffer, reductionBuffer,
@ -70,14 +70,14 @@ namespace functions {
}
template <typename X, typename Z>
_CUDA_H void IndexReduce<X, Z>::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {
_CUDA_H void IndexReduce<X, Z>::executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int opNum, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) {
simpleIndexReduceGeneric<X, Z><<<launchDims.x,launchDims.y,launchDims.z, *stream>>>(
opNum,
dx,
xShapeInfo, xRank,
extraParams,
result,
resultShapeInfo, zRank,
zShapeInfo, zRank,
dimension,
dimensionLength,
1, allocationBuffer, reductionBuffer, tadOnlyShapeInfo, tadOffsets);
@ -158,7 +158,7 @@ namespace functions {
Nd4jLong *xShapeInfo,
void *extraParams,
void *result,
Nd4jLong *resultShapeInfo,
Nd4jLong *zShapeInfo,
int *dimension,
int dimensionLength,
int postProcessOrNot,
@ -166,7 +166,7 @@ namespace functions {
void *reductionBuffer,
Nd4jLong *tadShapeInfo,
Nd4jLong *tadOffset) {
DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, resultShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS);
DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, result, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS);
}
@ -174,7 +174,7 @@ namespace functions {
template <typename OpType>
__device__ void IndexReduce<X, Z>::transform(void *vdx, Nd4jLong *xShapeInfo,
void *vextraParams,
void *vresult, Nd4jLong *resultShapeInfo,
void *vz, Nd4jLong *zShapeInfo,
int *dimension, int dimensionLength,
int postProcessOrNot,
int *allocationBuffer, void *vreductionBuffer,
@ -183,7 +183,7 @@ namespace functions {
* Gpu information for the problem
*/
auto dx = reinterpret_cast<X*>(vdx);
auto result = reinterpret_cast<Z*>(vresult);
auto z = reinterpret_cast<Z*>(vz);
auto extraParams = static_cast<X*>(vextraParams);
auto reductionBuffer = static_cast<X*>(vreductionBuffer);
auto order = shape::order(xShapeInfo);
@ -203,19 +203,19 @@ namespace functions {
//length for the tad
__shared__ volatile Nd4jLong xLength;
__shared__ volatile Nd4jLong resultLength;
__shared__ volatile Nd4jLong zLen;
//only compute the tad indexes once
IndexValue<X> reduction = OpType::startingIndexValue(dx);
if (threadIdx.x == 0) {
if (resultShapeInfo != nullptr)
resultLength = shape::length(resultShapeInfo);
else resultLength = 1;
if (zShapeInfo != nullptr)
zLen = shape::length(zShapeInfo);
else zLen = 1;
if (dimensionLength == 1) {
if (resultLength == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION))
if (zLen == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION))
resultScalar = 1;
else
resultScalar = 0;
@ -223,13 +223,24 @@ namespace functions {
else
resultScalar = 0;
if (resultLength == 1)
if (zLen == 1)
resultScalar = 1;
xLength = shape::length(xShapeInfo);
}
__syncthreads();
if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) {
if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
return;
for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x)
z[i] = (Z) reduction.index;
return;
}
if (!resultScalar) {
__shared__ Nd4jLong tadLength;
@ -261,7 +272,7 @@ namespace functions {
__syncthreads();
if (threadIdx.x == 0) {
result[r] = (Z) sPartials[threadIdx.x].index;
z[r] = (Z) sPartials[threadIdx.x].index;
}
__syncthreads();
}
@ -282,7 +293,7 @@ namespace functions {
__syncthreads();
if (threadIdx.x == 0) {
result[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
z[i] = (Z) sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
}
__syncthreads();
}
@ -345,14 +356,14 @@ namespace functions {
__syncthreads();
if (tid == 0) {
result[0] = (Z) sPartials[0].index;
z[0] = (Z) sPartials[0].index;
}
}
} else {
if (tid == 0) {
auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);
tc[16384] = 0;
result[0] = (Z) sPartials[0].index;
z[0] = (Z) sPartials[0].index;
}
}

View File

@ -143,7 +143,7 @@ namespace nd4j {
cudaFreeHost((void *)this->_ptrHost);
if (this->_allocatedDevice && !_externalized)
cudaFree((void *)this->_ptrHost);
cudaFree((void *)this->_ptrDevice);
freeSpills();
}

View File

@ -27,6 +27,7 @@
#include <mutex>
#include <ops/declarable/DeclarableOp.h>
#include <ops/declarable/PlatformHelper.h>
#include <execution/Engine.h>
// handlers part
#include <cstdlib>
@ -66,8 +67,8 @@ namespace nd4j {
std::vector<nd4j::ops::DeclarableOp *> _uniqueD;
// pointers to platform-specific helpers
std::map<Nd4jLong, nd4j::ops::platforms::PlatformHelper*> _helpersLH;
std::map<std::string, nd4j::ops::platforms::PlatformHelper*> _helpersH;
std::map<std::pair<Nd4jLong, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> _helpersLH;
std::map<std::pair<std::string, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> _helpersH;
std::vector<nd4j::ops::platforms::PlatformHelper*> _uniqueH;
std::mutex _locker;
@ -98,13 +99,13 @@ namespace nd4j {
void registerHelper(nd4j::ops::platforms::PlatformHelper* op);
bool hasHelper(Nd4jLong hash);
bool hasHelper(Nd4jLong hash, samediff::Engine engine);
nd4j::ops::DeclarableOp* getOperation(const char *name);
nd4j::ops::DeclarableOp* getOperation(Nd4jLong hash);
nd4j::ops::DeclarableOp* getOperation(std::string &name);
nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash);
nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash, samediff::Engine engine);
std::vector<Nd4jLong> getAllHashes();

View File

@ -22,6 +22,7 @@
#define SD_PLATFORMHELPER_H
#include <ShapeUtils.h>
#include <execution/Engine.h>
#include <graph/Context.h>
#include <string>
#include <pointercast.h>
@ -35,18 +36,23 @@ namespace nd4j {
*/
class ND4J_EXPORT PlatformHelper {
protected:
// target engine for this impl
samediff::Engine _engine;
// name of the operation this helper is built for
std::string _name;
// hash of the operation this helper is built for
Nd4jLong _hash;
public:
PlatformHelper(const char *name);
PlatformHelper(const char *name, samediff::Engine engine);
~PlatformHelper() = default;
std::string name();
samediff::Engine engine();
Nd4jLong hash();
/**

View File

@ -199,16 +199,16 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
int dH = INT_ARG(10); // dilations height
int dW = INT_ARG(11); // dilations width
int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID
int isNDHWC = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
int trueoD, trueoH, trueoW; // true output depth/height/width
ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);
REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D_BP OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
std::string expectedGradOShape = ShapeUtils::shapeAsString(ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW, 0,indIOioC,indIOioD,indIOioD+1,indIOioD+2}));
std::string expectedWeightsShape = ShapeUtils::shapeAsString({kD, kH, kW, iC, oC});
REQUIRE_TRUE(expectedGradOShape == ShapeUtils::shapeAsString(gradO), 0, "CUSTOM CONV3D_BP OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", expectedGradOShape.c_str(), ShapeUtils::shapeAsString(gradO).c_str());
@ -222,7 +222,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
std::vector<int> gradOaxesForDot;
if(!isNDHWC) {
if(!isNCDHW) {
gradOaxesForDot = {0,1,2,3}; // bS, oD, oH, oW
input = new NDArray(input->permute({0,4,1,2,3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW]
gradI = new NDArray(gradI->permute({0,4,1,2,3})); // [bS, iD, iH, iW, iC] -> [bS, iC, iD, iH, iW]
@ -249,7 +249,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) {
MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2,3,4,1,0,5,6,7}); // [kD, kH, kW, iC, oC] x [bS, oD, oH, oW, oC]/[bS, oC, oD, oH, oW] = [kD, kH, kW, iC, bS, oD, oH, oW]
ConvolutionUtils::col2vol(block, columns, *gradI, sD, sH, sW, pD, pH, pW, dD, dH, dW); // columns [bS, iC, kD, kH, kW, oD, oH, oW] is de-convoluted to [bS, iC, iD, iH, iW]
if(!isNDHWC) {
if(!isNCDHW) {
delete input;
delete gradI;
}
@ -287,7 +287,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
int dH = INT_ARG(10); // dilations height
int dW = INT_ARG(11); // dilations width
int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID
int isNDHWC = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
const int rank = 5;
REQUIRE_TRUE(paddingMode < 2, 0, "CUSTOM CONV3D OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
@ -296,7 +296,7 @@ DECLARE_SHAPE_FN(conv3dnew_bp) {
REQUIRE_TRUE(gradOShapeInfo[0] == rank, 0, "CUSTOM CONV3D_BP OP: rank of output gradients (next epsilon) array must be equal to %i, but got %i instead !", rank, gradOShapeInfo);
int indIOioC, indIiD, indWoC(4);
if(!isNDHWC) {
if(!isNCDHW) {
indIOioC = 4; indIiD = 1;
}
else {

View File

@ -41,8 +41,10 @@ namespace nd4j {
static inline void calcOutSizePool2D(int& oH, int& oW, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int iH, const int iW, const int paddingMode) {
if(paddingMode == 0) { // valid
oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;
oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;
// oH = (iH - (kH + (kH-1)*(dH-1)) + 2*pH)/sH + 1;
// oW = (iW - (kW + (kW-1)*(dW-1)) + 2*pW)/sW + 1;
oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1;
oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1;
}
else if (paddingMode == 1) { // same
oH = (int) math::nd4j_ceil<double, double>(iH * 1. / sH);
@ -57,9 +59,9 @@ namespace nd4j {
static inline void calcOutSizePool3D(int& oD, int& oH, int& oW, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int iD, const int iH, const int iW, const int paddingMode) {
if(paddingMode == 0) { // valid
oD = (iD - (kD + (kD - 1) * (dD - 1)) + 2 * pD) / sD + 1;
oH = (iH - (kH + (kH - 1) * (dH - 1)) + 2 * pH) / sH + 1;
oW = (iW - (kW + (kW - 1) * (dW - 1)) + 2 * pW) / sW + 1;
oD = (iD - ((kD - 1) * dD + 1) + 2 * pD) / sD + 1;
oH = (iH - ((kH - 1) * dH + 1) + 2 * pH) / sH + 1;
oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1;
}
else if(paddingMode == 1) { // same
oD = (int) nd4j::math::nd4j_ceil<double, double>(iD * 1. / sD);

View File

@ -1121,8 +1121,12 @@ namespace helpers {
I const* cropSizes = reinterpret_cast<I const*>(cropSize->getSpecialBuffer());
T* outBuf = reinterpret_cast<T*>(crops->specialBuffer());
int threadsPerBlock = math::nd4j_max(imageHeight * imageWidth, cropHeight * cropWidth);
if(threadsPerBlock > MAX_NUM_THREADS/4)
threadsPerBlock = MAX_NUM_THREADS/4;
NDArray::prepareSpecialUse({crops}, {images, boxes, indices, cropSize});
cropAndResizeKernel<T,Z,I><<<batchSize, math::nd4j_max(imageHeight * imageWidth, cropHeight * cropWidth), 512, *stream>>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(),
cropAndResizeKernel<T,Z,I><<<batchSize, threadsPerBlock, 256, *stream>>>(imagesBuf, images->getSpecialShapeInfo(), boxesBuf, boxes->getSpecialShapeInfo(), indexBuf, indices->getSpecialShapeInfo(),
cropSizes, cropSize->getSpecialShapeInfo(), method, extrapolationVal, outBuf, crops->specialShapeInfo(), numBoxes, cropHeight, cropWidth, batchSize, imageHeight, imageWidth, depth);
NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize});
}

View File

@ -30,7 +30,7 @@ namespace helpers {
//////////////////////////////////////////////////////////////////////////
// calculate digamma function for array elements
template <typename T>
static void lgamma_(NDArray& x, NDArray& z) {
void lgamma_(NDArray& x, NDArray& z) {
//auto dtype = x.dataType();
auto lgammaProc = LAMBDA_T(x_, dtype) {
return T(DataTypeUtils::fromT<T>() == DataType::DOUBLE?::lgamma(x_): ::lgammaf(x_)); //math::nd4j_log<T,T>(math::nd4j_gamma<T,T>(x));

View File

@ -535,8 +535,8 @@ namespace nd4j {
// platform helpers use might be forbidden for various reasons, so we'll check it out first
if (block->helpersAllowed() && nd4j::Environment::getInstance()->helpersAllowed()) {
// if we have platform-specific helper for this op - invoke it
if (OpRegistrator::getInstance()->hasHelper(this->getOpHash())) {
auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash());
if (OpRegistrator::getInstance()->hasHelper(this->getOpHash(), block->engine())) {
auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash(), block->engine());
if (helper->isUsable(*block)) {
status = helper->invokeHelper(*block);
hasHelper = true;

View File

@ -69,9 +69,9 @@ namespace nd4j {
} else if (block.getTArguments()->size() > 0) {
auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext());
NDArray::prepareSpecialUse({z}, {x, &y});
NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1));
x->applyScalarArr(static_cast<nd4j::scalar::Ops>(opNum), y, *z);
// NDArray::prepareSpecialUse({z}, {x, &y});
// NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1));
manager.synchronize();
} else {

View File

@ -173,15 +173,18 @@ namespace nd4j {
}
void OpRegistrator::registerHelper(nd4j::ops::platforms::PlatformHelper* op) {
if (_helpersLH.count(op->hash()) > 0)
std::pair<Nd4jLong, samediff::Engine> p = {op->hash(), op->engine()};
if (_helpersLH.count(p) > 0)
throw std::runtime_error("Tried to double register PlatformHelper");
_uniqueH.emplace_back(op);
std::pair<std::string, nd4j::ops::platforms::PlatformHelper*> pair(op->name(), op);
nd4j_debug("Adding helper for op \"%s\": [%lld - %i]\n", op->name().c_str(), op->hash(), (int) op->engine());
std::pair<std::pair<std::string, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> pair({op->name(), op->engine()}, op);
_helpersH.insert(pair);
std::pair<Nd4jLong, nd4j::ops::platforms::PlatformHelper*> pair2(op->hash(), op);
std::pair<std::pair<Nd4jLong, samediff::Engine>, nd4j::ops::platforms::PlatformHelper*> pair2(p, op);
_helpersLH.insert(pair2);
}
@ -227,15 +230,17 @@ namespace nd4j {
return _declarablesD.at(name);
}
nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash) {
if (_helpersLH.count(hash) == 0)
nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash, samediff::Engine engine) {
std::pair<Nd4jLong, samediff::Engine> p = {hash, engine};
if (_helpersLH.count(p) == 0)
throw std::runtime_error("Requested helper can't be found");
return _helpersLH[hash];
return _helpersLH[p];
}
bool OpRegistrator::hasHelper(Nd4jLong hash) {
return _helpersLH.count(hash) > 0;
bool OpRegistrator::hasHelper(Nd4jLong hash, samediff::Engine engine) {
std::pair<Nd4jLong, samediff::Engine> p = {hash, engine};
return _helpersLH.count(p) > 0;
}
int OpRegistrator::numberOfOperations() {

View File

@ -24,10 +24,11 @@
namespace nd4j {
namespace ops {
namespace platforms {
PlatformHelper::PlatformHelper(const char *name) {
PlatformHelper::PlatformHelper(const char *name, samediff::Engine engine) {
// we just store name/hash of target operation
_name = std::string(name);
_hash = HashHelper::getInstance()->getLongHash(_name);
_engine = engine;
}
nd4j::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) {
@ -74,6 +75,10 @@ namespace nd4j {
return z;
}
samediff::Engine PlatformHelper::engine() {
return _engine;
}
std::string PlatformHelper::name() {
return _name;
}

View File

@ -0,0 +1,275 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include "cudnnUtils.h"
#include <ops/declarable/helpers/convolutions.h>
namespace nd4j {
namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////////
static void batchnormCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* mean, const NDArray* variance,
const NDArray* gamma, const NDArray* beta,
NDArray* output,
const double epsilon, const bool isSpatialMode) {
// input, output -> 4D:nchw, 5D:ncdhw
// mean, variance, gamma, beta -> 1xCx1x1 for 4D and 1xCx1x1x1 for 5D for BATCHNORM_MODE_SPATIAL mode
// -> 1xCxHxW for 4D and 1xCxDxHxW for 5D for BATCHNORM_MODE_PER_ACTIVATION mode
const cudnnDataType_t dataType = cudnnDataType(input->dataType());
const int xRank = input->rankOf();
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err);
const std::vector<int> xShape = input->getShapeAsVectorInt(); // input and output have same shapes
std::vector<int> paramsShape, paramsStrides; // mean, variance, gamma and beta have same shapes
if(isSpatialMode) { // 1xCx1x1
const int iC = mean->lengthOf();
const int stride0 = mean->strideAt(0);
paramsShape = xRank == 4 ? std::vector<int>({1, iC, 1, 1}) : std::vector<int>({1, iC, 1, 1, 1});
paramsStrides = xRank == 4 ? std::vector<int>({iC*stride0, stride0, 1, 1}) : std::vector<int>({iC*stride0, stride0, 1, 1, 1});
}
else {
paramsShape = mean->getShapeAsVectorInt();
paramsStrides = xRank == 4 ? std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3)}) : std::vector<int>({(int)mean->strideAt(0), (int)mean->strideAt(1), (int)mean->strideAt(2), (int)mean->strideAt(3), (int)mean->strideAt(4)});
}
std::vector<int> xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3)};
std::vector<int> zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3)};
if(xRank > 4) { // 5D
xStrides.push_back((int)input->strideAt(4));
zStrides.push_back((int)output->strideAt(4));
}
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data());
else
err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
// output descriptor
cudnnTensorDescriptor_t z;
cudnnCreateTensorDescriptor(&z);
if(output->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(z, format, dataType, xRank, xShape.data());
else
err = cudnnSetTensorNdDescriptor(z, dataType, xRank, xShape.data(), zStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err);
// mean, variance, gamma and beta descriptor, the same descriptor for all of them
cudnnTensorDescriptor_t params;
cudnnCreateTensorDescriptor(&params);
if(mean->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data());
else
err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err);
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* ptrAlpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* ptrBeta = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({output}, {input, mean, variance, gamma, beta});
// calculations
err = cudnnBatchNormalizationForwardInference(*handle, isSpatialMode ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION,
ptrAlpha, ptrBeta,
x, input->getSpecialBuffer(),
z, output->getSpecialBuffer(),
params,
gamma ? gamma->getSpecialBuffer(): nullptr,
beta ? beta->getSpecialBuffer() : nullptr,
mean->getSpecialBuffer(), variance->getSpecialBuffer(), epsilon);
if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err);
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("batchnormCUDNN: cudaStreamSynchronize failed !", cudaErr);
NDArray::registerSpecialUse({output}, {input, mean, variance, gamma, beta});
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(batchnorm, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0);
auto mean = INPUT_VARIABLE(1);
auto variance = INPUT_VARIABLE(2);
NDArray* gamma = nullptr;
NDArray* beta = nullptr;
auto output = OUTPUT_VARIABLE(0);
const bool applyScale = (bool)INT_ARG(0);
const bool applyOffset = (bool)INT_ARG(1);
const double epsilon = T_ARG(0);
if(applyScale)
gamma = INPUT_VARIABLE(3);
if(applyOffset)
beta = INPUT_VARIABLE(3 + (int)applyScale);
const int numOfIntArgs = block.getIArguments()->size();
const int inRank = input->rankOf();
// get axes args to normalize input array over
std::vector<int> axes;
if(numOfIntArgs > 2)
for(int i = 2; i < numOfIntArgs; ++i)
axes.push_back(INT_ARG(i));
else
axes.push_back(inRank-1); // default dimension to reduce along is last dimension
const int numOfAxes = axes.size();
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM CUDNN op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
// for example if input shape is {2,3,4,5,6} and axes = {1,3}, then expected shape would be {1,3,1,5,1}, and if axes = {3}, then expected shape would be {5}
std::vector<Nd4jLong> expShape;
if(numOfAxes == 1)
expShape.push_back(input->sizeAt(axes[0]));
else { // get, for example, something like {1, inputDim1, 1, inputDim3, 1} if axes = {1, 3}
expShape = std::vector<Nd4jLong>(inRank, 1);
for(uint i = 0; i < numOfAxes; ++i)
expShape[axes[i]] = input->sizeAt(axes[i]);
}
REQUIRE_TRUE(mean->isSameShape(expShape) , 0, "BATCHNORM CUDNN op: wrong shape of mean array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(mean).c_str());
REQUIRE_TRUE(variance->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of variance array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(variance).c_str());
if(gamma)
REQUIRE_TRUE(gamma->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of gamma array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(gamma).c_str());
if(beta)
REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM CUDNN op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
// types of all input arrays should be the same
for(int i = 1; i < block.width(); ++i)
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM CUDNN op: types of all input arrays should be the same !");
// cudnn supports NCHW format only
const bool needPermut = axes.size() == 1 && mean->lengthOf() == input->sizeAt(-1);
if(needPermut) { // if NHWC
std::vector<int> perm = {0, 3, 1, 2}; // NHWC -> NCHW
input = new NDArray(input->permute(perm));
output = new NDArray(output->permute(perm));
}
// calculations
batchnormCUDNN(block.launchContext(), input, mean, variance, gamma, beta, output, epsilon, axes.size() == 1);
if(needPermut) {
delete input;
delete output;
}
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(batchnorm, ENGINE_CUDA) {
const bool applyScale = (bool)INT_ARG(0);
const bool applyOffset = (bool)INT_ARG(1);
NDArray* input = INPUT_VARIABLE(0);
NDArray* mean = INPUT_VARIABLE(1);
NDArray* variance = INPUT_VARIABLE(2);
NDArray* gamma = applyScale ? INPUT_VARIABLE(3) : nullptr;
NDArray* beta = applyOffset ? INPUT_VARIABLE(3 + (int)applyScale) : nullptr;
const int numOfIntArgs = block.getIArguments()->size();
const int xRank = input->rankOf();
// disable cudnn batchnorm so far
return false;
// *********************************** //
if(xRank != 4 && xRank != 5)
return false;
// *********************************** //
const bool badType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
if(badType)
return false;
// *********************************** //
// get axes args to normalize input array over
std::vector<int> axes;
if(numOfIntArgs > 2)
for(int i = 2; i < numOfIntArgs; ++i)
axes.push_back(INT_ARG(i));
else
axes.push_back(xRank-1); // default dimension to reduce along is last dimension
if(axes.size() != 1 && axes.size() != 3 && axes.size() != 4)
return false;
// *********************************** //
bool allParamsHaveSameShapeAndStrides = shape::haveSameShapeAndStrides(mean->getShapeInfo(), variance->getShapeInfo());
if(gamma)
allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), gamma->getShapeInfo());
if(beta)
allParamsHaveSameShapeAndStrides &= shape::haveSameShapeAndStrides(mean->getShapeInfo(), beta->getShapeInfo());
if(!allParamsHaveSameShapeAndStrides)
return false;
// *********************************** //
bool isFormatGood = false;
if(axes.size() == 1)
isFormatGood = mean->lengthOf() == input->sizeAt(1) || mean->lengthOf() == input->sizeAt(-1); // mean [C]
else {
auto inputShapeModif = input->getShapeAsVector(); // [dim0,dim1,dim2,dim3] 4D or [dim0,dim1,dim2,dim3,dim4]
inputShapeModif[0] = 1;
isFormatGood = mean->isSameShape(inputShapeModif); // mean [1,dim1,dim2,dim3] 4D or [1,dim1,dim2,dim3,dim4]
}
if(!isFormatGood)
return false;
return true;
}
}
}
}

View File

@ -0,0 +1,521 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include "cudnnUtils.h"
#include <ops/declarable/helpers/convolutions.h>
namespace nd4j {
namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////////
static void conv2dCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
const int kH, const int kW,
const int sH, const int sW,
const int pH, const int pW,
const int dH, const int dW,
const int paddingMode, const bool isNCHW) {
int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err);
cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
// weights descriptor
cudnnFilterDescriptor_t w;
cudnnCreateFilterDescriptor(&w);
err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
if(err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetFilter4dDescriptor failed", err);
// output descriptor
cudnnTensorDescriptor_t z;
cudnnCreateTensorDescriptor(&z);
if(output->ews() == 1)
err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW);
else
err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1));
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err);
// algorithm description
cudnnConvolutionFwdAlgo_t algo;
err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
// allocate auxiliary device memory, abbreviation ws means workspace
size_t wsSize;
err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize);
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
void* wsData;
auto cudaErr = cudaMalloc(&wsData, wsSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({output}, {input, weights, bias});
// run calculation
err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnConvolutionForward failed", err);
// add bias if it is present
if (bias != nullptr) {
cudnnTensorDescriptor_t b;
cudnnCreateTensorDescriptor(&b);
err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err);
err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnAddTensor bias failed", err);
}
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("conv2dCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
NDArray::registerSpecialUse({output}, {input, weights, bias});
}
//////////////////////////////////////////////////////////////////////////
static void conv2dBpCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* gradO,
NDArray* gradI, NDArray* gradW, NDArray* gradB,
const int kH, const int kW,
const int sH, const int sW,
const int pH, const int pW,
const int dH, const int dW,
const int paddingMode, const bool isNCHW) {
int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: can't set stream for cuDNN", err);
cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
// gradO descriptor
cudnnTensorDescriptor_t dz;
cudnnCreateTensorDescriptor(&dz);
if(gradO->ews() == 1)
err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW);
else
err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1));
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err);
// gradI descriptor
cudnnTensorDescriptor_t dx;
cudnnCreateTensorDescriptor(&dx);
if(gradI->ews() == 1)
err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err);
// gradW descriptor
cudnnFilterDescriptor_t dw;
cudnnCreateFilterDescriptor(&dw);
err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
if(err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err);
// gradW algorithm description
cudnnConvolutionBwdFilterAlgo_t algoGradW;
err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
// gradI algorithm description
cudnnConvolutionBwdDataAlgo_t algoGradI;
err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
// allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
size_t wsGradWSize;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
void* wsGradWData;
auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
// allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
size_t wsGradISize;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
void* wsGradIData;
cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
// run calculation for gradB (if not nullptr)
if(gradB != nullptr) {
cudnnTensorDescriptor_t db;
cudnnCreateTensorDescriptor(&db);
err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf());
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err);
err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
}
// run calculation for gradW
err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
// run calculation for gradI
err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardData failed", err);
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("conv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsGradWData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
cudaErr = cudaFree(wsGradIData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv2d, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto output = OUTPUT_VARIABLE(0); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
int sH = INT_ARG(2); // strides height
int sW = INT_ARG(3); // strides width
int pH = INT_ARG(4); // paddings height
int pW = INT_ARG(5); // paddings width
int dH = INT_ARG(6); // dilations height
int dW = INT_ARG(7); // dilations width
int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME
bool isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width
REQUIRE_TRUE(input->rankOf() == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
if (bias) {
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
REQUIRE_TRUE((bias->rankOf() == 1 && bias->strideAt(0) == 1) || (bias->rankOf() == 2 && bias->sizeAt(0) == 1 && bias->strideAt(1) == 1) || (bias->rankOf() == 2 && bias->sizeAt(1) == 1 && bias->strideAt(0) == 1), 0, "CUSTOM CONV2D CUDNN OP: bias array should be contiguous in memory !");
}
NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC}
newWeights->assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC --> oC, iC, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = nullptr;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
conv2dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW);
if(newInput != input)
delete newInput;
delete newWeights;
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(conv2d, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType;
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv2d_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
auto gradW = OUTPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC]
int kH = INT_ARG(0); // filter(kernel) height
int kW = INT_ARG(1); // filter(kernel) width
int sH = INT_ARG(2); // strides height
int sW = INT_ARG(3); // strides width
int pH = INT_ARG(4); // paddings height
int pW = INT_ARG(5); // paddings width
int dH = INT_ARG(6); // dilations height
int dW = INT_ARG(7); // dilations width
int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME
int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
REQUIRE_TRUE(input->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
REQUIRE_TRUE(gradO->rankOf() == 4, 0, "CUSTOM CONV2D_BP CUDNN OP: rank of output's gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf());
int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
int trueoH, trueoW; // true output height, width
ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode);
ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW, 0,indIOioC,indOoH,indOoH+1});
std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, oC};
REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
if(bias)
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CUSTOM CONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
NDArray* newGradW = new NDArray(gradW->ordering(), {oC, iC, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC}
NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kH, kW}, weights->dataType(), weights->getContext());
newWeights->assign(weights->permute({3,2,0,1})); // permute weights (kH, kW, iC, oC --> oC, iC, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = gradI;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
conv2dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW);
newGradW->permutei({2,3,1,0}); // [oC, iC, kH, kW] -> [kH, kW, iC, oC]
gradW->assign(newGradW);
if(newInput != input) {
if(isNCHW)
gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3)}));
else
gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,0}));
delete newInput;
delete newGradI;
}
delete newWeights;
delete newGradW;
return Status::OK();
}
PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL
const int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return isNCHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType;
}
// PLATFORM_IMPL(conv2d, ENGINE_CUDA) {
// auto handle = reinterpret_cast<cudnnHandle_t *>(block.launchContext()->getCuDnnHandle());
// auto res = cudnnSetStream(*handle, *block.launchContext()->getCudaStream());
// if (res != 0)
// throw nd4j::cuda_exception::build("Can't set stream for cuDNN", res);
// auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
// auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
// auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
// auto output = OUTPUT_VARIABLE(0); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
// NDArray::prepareSpecialUse({output}, {input, weights, bias});
// int sH = INT_ARG(2); // strides height
// int sW = INT_ARG(3); // strides width
// int pH = INT_ARG(4); // paddings height
// int pW = INT_ARG(5); // paddings width
// int dH = INT_ARG(6); // dilations height
// int dW = INT_ARG(7); // dilations width
// int isSameMode = INT_ARG(8); // 0-VALID, 1-SAME
// bool isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
// int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0)); // filter(kernel) height
// int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1)); // filter(kernel) width
// int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
// int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes
// ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
// ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, isSameMode);
// auto dtype = cudnnDataType(input->dataType());
// cudnnTensorDescriptor_t src;
// cudnnCreateTensorDescriptor(&src);
// res = cudnnSetTensor4dDescriptorEx(src, dtype, input->sizeAt(0), input->sizeAt(1), input->sizeAt(2), input->sizeAt(3), input->strideAt(0), input->strideAt(1), input->strideAt(2), input->strideAt(3));
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx src failed", res);
// // TODO: we definitely want NHWC here as well
// cudnnFilterDescriptor_t wght;
// cudnnCreateFilterDescriptor(&wght);
// res = cudnnSetFilter4dDescriptor(wght, dtype, CUDNN_TENSOR_NCHW, oC, iC, kH, kW);
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetFilter4dDescriptor failed", res);
// cudnnConvolutionDescriptor_t cdc;
// cudnnCreateConvolutionDescriptor(&cdc);
// res = cudnnSetConvolution2dDescriptor(cdc, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, dtype);
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetConvolution2dDescriptor failed", res);
// cudnnTensorDescriptor_t dst;
// cudnnCreateTensorDescriptor(&dst);
// res = cudnnSetTensor4dDescriptorEx(dst, dtype, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3), output->strideAt(0), output->strideAt(1), output->strideAt(2), output->strideAt(3));
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx dst failed", res);
// // TODO: workspace algorithms are supposed to be faster, so we should use it here if we have enough memory
// cudnnConvolutionFwdAlgo_t algo;
// res = cudnnGetConvolutionForwardAlgorithm(*handle, src, wght, cdc, dst, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo);
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnGetConvolutionForwardAlgorithm failed", res);
// // TODO: should be float if dtype is half/float, and double otherwise
// float alpha = 1.0f;
// float beta = 0.0f;
// res = cudnnConvolutionForward(*handle, &alpha, src, input->specialBuffer(), wght, weights->specialBuffer(), cdc, algo, nullptr, 0, &beta, dst, output->specialBuffer());
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnConvolutionForward failed", res);
// if (bias != nullptr) {
// cudnnTensorDescriptor_t bs;
// cudnnCreateTensorDescriptor(&bs);
// if (isNCHW) {
// res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NCHW, dtype, 1, bias->lengthOf(), 1, 1);
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res);
// } else {
// res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NHWC, dtype, 1, 1, 1, bias->lengthOf());
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res);
// }
// res = cudnnAddTensor(*handle, &alpha, bs, bias->specialBuffer(), &alpha, dst, output->specialBuffer());
// if (res != 0)
// throw nd4j::cuda_exception::build("cudnnAddTensor failed", res);
// }
// NDArray::registerSpecialUse({output}, {input, weights, bias});
// return Status::OK();
// }
}
}
}

View File

@ -0,0 +1,453 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include "cudnnUtils.h"
#include <ops/declarable/helpers/convolutions.h>
namespace nd4j {
namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////////
static void conv3dCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
const int kD, const int kH, const int kW,
const int sD, const int sH, const int sW,
const int pD, const int pH, const int pW,
const int dD, const int dH, const int dW,
const int paddingMode, const bool isNCDHW) {
const int numDims = 5;
int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: can't set stream for cuDNN", err);
const std::vector<int> pads = {pD, pH, pW};
const std::vector<int> filtStrides = {sD, sH, sW};
const std::vector<int> dilations = {dD, dH, dW};
const std::vector<int> xShape = {bS, iC, iD, iH, iW};
const std::vector<int> zShape = {bS, oC, oD, oH, oW};
const std::vector<int> wShape = {oC, iC, kD, kH, kW};
const std::vector<int> bShape = {1, (isNCDHW ? oC : 1), 1, 1, (isNCDHW ? 1 : oC)};
const std::vector<int> xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)};
const std::vector<int> zStrides = {(int)output->strideAt(0), (int)output->strideAt(1), (int)output->strideAt(2), (int)output->strideAt(3), (int)output->strideAt(4)};
cudnnTensorFormat_t format = isNCDHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data());
else
err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
// weights descriptor
cudnnFilterDescriptor_t w;
cudnnCreateFilterDescriptor(&w);
err = cudnnSetFilterNdDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data());
if(err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetFilterNdDescriptor failed", err);
// output descriptor
cudnnTensorDescriptor_t z;
cudnnCreateTensorDescriptor(&z);
if(output->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(z, format, cudnnDataType(output->dataType()), numDims, zShape.data());
else
err = cudnnSetTensorNdDescriptor(z, cudnnDataType(output->dataType()), numDims, zShape.data(), zStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
// algorithm description
cudnnConvolutionFwdAlgo_t algo;
err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
// allocate auxiliary device memory, abbreviation ws means workspace
size_t wsSize;
err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize);
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
void* wsData;
auto cudaErr = cudaMalloc(&wsData, wsSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({output}, {input, weights, bias});
// run calculation
err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnConvolutionForward failed", err);
// add bias if it is present
if (bias != nullptr) {
cudnnTensorDescriptor_t b;
cudnnCreateTensorDescriptor(&b);
err = cudnnSetTensorNdDescriptorEx(b, format, cudnnDataType(bias->dataType()), numDims, bShape.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor for bias failed", err);
err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnAddTensor bias failed", err);
}
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("conv3dCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
NDArray::registerSpecialUse({output}, {input, weights, bias});
}
//////////////////////////////////////////////////////////////////////////
static void conv3dBpCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* gradO,
NDArray* gradI, NDArray* gradW, NDArray* gradB,
const int kD, const int kH, const int kW,
const int sD, const int sH, const int sW,
const int pD, const int pH, const int pW,
const int dD, const int dH, const int dW,
const int paddingMode, const bool isNCDHW) {
const int numDims = 5;
int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: can't set stream for cuDNN", err);
const std::vector<int> pads = {pD, pH, pW};
const std::vector<int> filtStrides = {sD, sH, sW};
const std::vector<int> dilations = {dD, dH, dW};
const std::vector<int> xShape = {bS, iC, iD, iH, iW};
const std::vector<int> dzShape = {bS, oC, oD, oH, oW};
const std::vector<int> wShape = {oC, iC, kD, kH, kW};
const std::vector<int> dbShape = {1, (int)(isNCDHW ? oC : 1), 1, 1, (int)(isNCDHW ? 1 : oC)};
const std::vector<int> xStrides = {(int)input->strideAt(0), (int)input->strideAt(1), (int)input->strideAt(2), (int)input->strideAt(3), (int)input->strideAt(4)};
const std::vector<int> dxStrides = {(int)gradI->strideAt(0), (int)gradI->strideAt(1), (int)gradI->strideAt(2), (int)gradI->strideAt(3), (int)gradI->strideAt(4)};
const std::vector<int> dzStrides = {(int)gradO->strideAt(0), (int)gradO->strideAt(1), (int)gradO->strideAt(2), (int)gradO->strideAt(3), (int)gradO->strideAt(4)};
cudnnTensorFormat_t format = isNCDHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data());
else
err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err);
// gradO descriptor
cudnnTensorDescriptor_t dz;
cudnnCreateTensorDescriptor(&dz);
if(gradO->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(dz, format, cudnnDataType(gradO->dataType()), numDims, dzShape.data());
else
err = cudnnSetTensorNdDescriptor(dz, cudnnDataType(gradO->dataType()), numDims, dzShape.data(), dzStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err);
// gradI descriptor
cudnnTensorDescriptor_t dx;
cudnnCreateTensorDescriptor(&dx);
if(gradI->ews() == 1)
err = cudnnSetTensorNdDescriptorEx(dx, format, cudnnDataType(gradI->dataType()), numDims, xShape.data());
else
err = cudnnSetTensorNdDescriptor(dx, cudnnDataType(gradI->dataType()), numDims, xShape.data(), dxStrides.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err);
// gradW descriptor
cudnnFilterDescriptor_t dw;
cudnnCreateFilterDescriptor(&dw);
err = cudnnSetFilterNdDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data());
if(err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetFilterNdDescriptor failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetConvolutionNdDescriptor failed", err);
// gradW algorithm description
cudnnConvolutionBwdFilterAlgo_t algoGradW;
err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
// gradI algorithm description
cudnnConvolutionBwdDataAlgo_t algoGradI;
err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
// allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
size_t wsGradWSize;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
void* wsGradWData;
auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
// allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
size_t wsGradISize;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
void* wsGradIData;
cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
// run calculation for gradB (if not nullptr)
if(gradB != nullptr) {
cudnnTensorDescriptor_t db;
cudnnCreateTensorDescriptor(&db);
err = cudnnSetTensorNdDescriptorEx(db, format, cudnnDataType(gradB->dataType()), numDims, dbShape.data());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor for gradB failed", err);
err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
}
// run calculation for gradW
err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
// run calculation for gradI
err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardData failed", err);
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("conv3dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsGradWData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
cudaErr = cudaFree(wsGradIData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv3dnew, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto output = OUTPUT_VARIABLE(0); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW)
REQUIRE_TRUE(input->rankOf() == 5, 0, "CONV3D CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf());
int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) depth
int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) height
int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast<int>(weights->sizeAt(2));// filter(kernel) width
int sD = INT_ARG(3); // strides depth
int sH = INT_ARG(4); // strides height
int sW = INT_ARG(5); // strides width
int pD = INT_ARG(6); // paddings depth
int pH = INT_ARG(7); // paddings height
int pW = INT_ARG(8); // paddings width
int dD = INT_ARG(9); // dilations depth
int dH = INT_ARG(10); // dilations height
int dW = INT_ARG(11); // dilations width
int paddingMode = INT_ARG(12); // 0-SAME, 1-VALID
int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode);
std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "CONV3D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
if (bias)
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support only two formats {oC,iC,kH,kW} and {oC,kH,kW,iC}
newWeights->assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC --> oC, iC, kD, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = nullptr;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW);
conv3dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW, paddingMode, isNCDHW);
if(newInput != input)
delete newInput;
delete newWeights;
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(conv3dnew, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
int paddingMode = INT_ARG(12); // 0-SAME, 1-VALID
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType;
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv3dnew_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next
auto gradI = OUTPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW), epsilon
auto gradW = OUTPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC]
REQUIRE_TRUE(input->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of input array must be equal to 5, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of weights array must be equal to 5, but got %i instead !", weights->rankOf());
REQUIRE_TRUE(gradO->rankOf() == 5, 0, "CONV3D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 5, but got %i instead !", gradO->rankOf());
int kD = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) depth
int kH = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) height
int kW = INT_ARG(2) > 0 ? INT_ARG(2) : static_cast<int>(weights->sizeAt(2));// filter(kernel) width
int sD = INT_ARG(3); // strides depth
int sH = INT_ARG(4); // strides height
int sW = INT_ARG(5); // strides width
int pD = INT_ARG(6); // paddings depth
int pH = INT_ARG(7); // paddings height
int pW = INT_ARG(8); // paddings width
int dD = INT_ARG(9); // dilations depth
int dH = INT_ARG(10); // dilations height
int dW = INT_ARG(11); // dilations width
int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID
int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
int bS, iC, iD, iH, iW, oC, oD, oH, oW; // batch size, input channels, input depth/height/width, output channels, output depth/height/width;
int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv3d(isNCDHW, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD);
int trueoD, trueoH, trueoW; // true output depth/height/width
ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, paddingMode);
REQUIRE_TRUE(paddingMode < 2, 0, "CONV3D_BP CUDNN OP: causal padding mode (paddingMode = 2) is not allowed for this operation !");
std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoD,trueoH,trueoW, 0,indIOioC,indIOioD,indIOioD+1,indIOioD+2});
std::vector<Nd4jLong> expectedWeightsShape = {kD, kH, kW, iC, oC};
REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "CONV3D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
REQUIRE_TRUE(gradW->isSameShape(expectedWeightsShape), 0, "CONV3D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
if(bias)
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "CONV3D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW, paddingMode);
NDArray* newGradW = new NDArray(gradW->ordering(), {oC, iC, kD, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support only two formats for weights {oC,iC,kH,kW} and {oC,kH,kW,iC}
NDArray* newWeights = new NDArray(weights->ordering(), {oC, iC, kD, kH, kW}, weights->dataType(), weights->getContext());
newWeights->assign(weights->permute({4,3,0,1,2})); // permute weights (kD, kH, kW, iC, oC --> oC, iC, kD, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = gradI;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv3dCUDNNPadAsymmetric(newInput, newGradI, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, isNCDHW);
conv3dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kD,kH,kW,sD,sH,sW,pD,pH,pW,dD,dH,dW,paddingMode,isNCDHW);
newGradW->permutei({2,3,4,1,0}); // [oC, iC, kD, kH, kW] -> [kD, kH, kW, iC, oC]
gradW->assign(newGradW);
if(newInput != input) {
if(isNCDHW)
gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3), 0,gradI->sizeAt(4)}));
else
gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,gradI->sizeAt(3), 0,0}));
delete newInput;
delete newGradI;
}
delete newWeights;
delete newGradW;
return Status::OK();
}
PLATFORM_CHECK(conv3dnew_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW), epsilon_next
int paddingMode = INT_ARG(12); // 1-SAME, 0-VALID
int isNCDHW = block.getIArguments()->size() > 13 ? !INT_ARG(13) : 1; // INT_ARG(13): 1-NDHWC, 0-NCDHW
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return isNCDHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType;
}
}
}
}

View File

@ -0,0 +1,158 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef SD_CUDNNUTILS_H
#define SD_CUDNNUTILS_H
#include <ops/declarable/PlatformHelper.h>
#include <ops/declarable/OpRegistrator.h>
#include <platform_boilerplate.h>
#include <exceptions/cuda_exception.h>
#include <exceptions/datatype_exception.h>
#include <dll.h>
#include <cudnn.h>
namespace nd4j {
namespace ops {
namespace platforms {
DECLARE_PLATFORM(conv2d, ENGINE_CUDA);
DECLARE_PLATFORM(conv2d_bp, ENGINE_CUDA);
DECLARE_PLATFORM(conv3dnew, ENGINE_CUDA);
DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CUDA);
DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CUDA);
DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CUDA);
DECLARE_PLATFORM(batchnorm, ENGINE_CUDA);
DECLARE_PLATFORM(batchnorm_bp, ENGINE_CUDA);
//////////////////////////////////////////////////////////////////////////
FORCEINLINE cudnnDataType_t cudnnDataType(nd4j::DataType dataType) {
switch (dataType) {
case nd4j::DataType::FLOAT32:
return CUDNN_DATA_FLOAT;
case nd4j::DataType::DOUBLE:
return CUDNN_DATA_DOUBLE;
case nd4j::DataType::HALF:
return CUDNN_DATA_HALF;
case nd4j::DataType::INT32:
return CUDNN_DATA_INT32;
case nd4j::DataType::INT8:
return CUDNN_DATA_INT8;
default:
throw datatype_exception::build("Unsupported data type", dataType);
}
}
//////////////////////////////////////////////////////////////////////////
FORCEINLINE void checkConv2dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI,
const int iH, const int iW,
const int oH, const int oW,
const int kH, const int kW,
const int sH, const int sW,
const int pH, const int pW,
const int dH, const int dW,
const bool isNCHW) {
const auto pHsum = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH);
const auto pWsum = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW);
const bool isPHasymm = pH != (pHsum - pH);
const bool isPWasymm = pW != (pWsum - pW);
if(!isPHasymm && !isPWasymm)
return;
std::vector<Nd4jLong> newShape = input->getShapeAsVector();
const int iHposition = isNCHW ? 2 : 1;
if(isPHasymm)
newShape[iHposition] += 1;
if(isPWasymm)
newShape[iHposition + 1] += 1;
NDArray* newInput = new NDArray(input->ordering(), newShape, input->dataType(), input->getContext());
if(isNCHW)
(*newInput)({0,0, 0,0, 0,input->sizeAt(2), 0,input->sizeAt(3)}).assign(input);
else
(*newInput)({0,0, 0,input->sizeAt(1), 0,input->sizeAt(2), 0,0}).assign(input);
input = newInput;
if(gradI != nullptr)
gradI = new NDArray(gradI->ordering(), newShape, gradI->dataType(), gradI->getContext());
}
//////////////////////////////////////////////////////////////////////////
FORCEINLINE void checkConv3dCUDNNPadAsymmetric(NDArray* &input, NDArray* &gradI,
const int iD, const int iH, const int iW,
const int oD, const int oH, const int oW,
const int kD, const int kH, const int kW,
const int sD, const int sH, const int sW,
const int pD, const int pH, const int pW,
const int dD, const int dH, const int dW,
const bool isNCDHW) {
const auto pDsum = ((oD - 1) * sD + ((kD - 1) * dD + 1) - iD);
const auto pHsum = ((oH - 1) * sH + ((kH - 1) * dH + 1) - iH);
const auto pWsum = ((oW - 1) * sW + ((kW - 1) * dW + 1) - iW);
const bool isPDasymm = pD != (pDsum - pD);
const bool isPHasymm = pH != (pHsum - pH);
const bool isPWasymm = pW != (pWsum - pW);
if(!isPDasymm && !isPHasymm && !isPWasymm)
return;
std::vector<Nd4jLong> newShape = input->getShapeAsVector();
const int iDposition = isNCDHW ? 2 : 1;
if(isPDasymm)
newShape[iDposition] += 1;
if(isPHasymm)
newShape[iDposition + 1] += 1;
if(isPWasymm)
newShape[iDposition + 2] += 1;
NDArray* newInput = new NDArray(input->ordering(), newShape, input->dataType(), input->getContext());
if(isNCDHW)
(*newInput)({0,0, 0,0, 0,input->sizeAt(2), 0,input->sizeAt(3), 0,input->sizeAt(4)}).assign(input);
else
(*newInput)({0,0, 0,input->sizeAt(1), 0,input->sizeAt(2), 0,input->sizeAt(3), 0,0}).assign(input);
input = newInput;
if(gradI != nullptr)
gradI = new NDArray(gradI->ordering(), newShape, gradI->dataType(), gradI->getContext());
}
}
}
}
#endif //SD_CUDNNUTILS_H

View File

@ -0,0 +1,443 @@
/*******************************************************************************
* Copyright (c) 2019 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include "cudnnUtils.h"
#include <ops/declarable/helpers/convolutions.h>
namespace nd4j {
namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////////
static void depthwiseConv2dCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output,
const int kH, const int kW,
const int sH, const int sW,
const int pH, const int pW,
const int dH, const int dW,
const int paddingMode, const bool isNCHW) {
// cudnn supports only following case: mC = 1, oC = iC (groupCount == iC)
// input [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc
// weights [iC, mC, kH, kW], mkl doesn't support this format, so we'll make permute
// bias [oC], may be nullptr
// output [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc
// oC = iC*mC
int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
mC = weights->sizeAt(1);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: can't set stream for cuDNN", err);
cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
// weights descriptor
cudnnFilterDescriptor_t w;
cudnnCreateFilterDescriptor(&w);
err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW);
if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetFilter4dDescriptor failed", err);
// output descriptor
cudnnTensorDescriptor_t z;
cudnnCreateTensorDescriptor(&z);
if(output->ews() == 1)
err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW);
else
err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err);
err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolutionGroupCount failed", err);
// algorithm description
cudnnConvolutionFwdAlgo_t algo;
err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err);
// allocate auxiliary device memory, abbreviation ws means workspace
size_t wsSize;
err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err);
void* wsData;
auto cudaErr = cudaMalloc(&wsData, wsSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = output->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({output}, {input, weights, bias});
// run calculation
err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnConvolutionForward failed", err);
// add bias if it is present
if (bias != nullptr) {
cudnnTensorDescriptor_t b;
cudnnCreateTensorDescriptor(&b);
err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err);
err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnAddTensor bias failed", err);
}
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("depthwiseConv2dCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr);
NDArray::registerSpecialUse({output}, {input, weights, bias});
}
//////////////////////////////////////////////////////////////////////////
static void depthwiseConv2dBpCUDNN(const LaunchContext* context,
const NDArray* input, const NDArray* weights, const NDArray* gradO,
NDArray* gradI, NDArray* gradW, NDArray* gradB,
const int kH, const int kW,
const int sH, const int sW,
const int pH, const int pW,
const int dH, const int dW,
const int paddingMode, const bool isNCHW) {
// cudnn supports only following case: mC = 1, oC = iC (groupCount == iC)
// input, gradI [bS, iC, iH, iW] nchw or [bS, iH, iW, iC] nhwc
// weights, gradW [iC, mC, kH, kW], mkl doesn't support this format, so we'll make permute
// gradB [oC], may be nullptr
// gradO [bS, oC, oH, oW] nchw or [bS, oH, oW, oC] nhwc
// oC = iC*mC
int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
mC = weights->sizeAt(1);
auto handle = reinterpret_cast<cudnnHandle_t *>(context->getCuDnnHandle());
cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: can't set stream for cuDNN", err);
cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC;
// input descriptor
cudnnTensorDescriptor_t x;
cudnnCreateTensorDescriptor(&x);
if(input->ews() == 1)
err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err);
// gradO descriptor
cudnnTensorDescriptor_t dz;
cudnnCreateTensorDescriptor(&dz);
if(gradO->ews() == 1)
err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW);
else
err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err);
// gradI descriptor
cudnnTensorDescriptor_t dx;
cudnnCreateTensorDescriptor(&dx);
if(gradI->ews() == 1)
err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW);
else
err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err);
// gradW descriptor
cudnnFilterDescriptor_t dw;
cudnnCreateFilterDescriptor(&dw);
err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW);
if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err);
// description of convolution
cudnnConvolutionDescriptor_t conv;
cudnnCreateConvolutionDescriptor(&conv);
err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType()));
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err);
err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolutionGroupCount failed", err);
// gradW algorithm description
cudnnConvolutionBwdFilterAlgo_t algoGradW;
err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err);
// gradI algorithm description
cudnnConvolutionBwdDataAlgo_t algoGradI;
err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err);
// allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace
size_t wsGradWSize;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err);
void* wsGradWData;
auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr);
// allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace
size_t wsGradISize;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize);
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err);
void* wsGradIData;
cudaErr = cudaMalloc(&wsGradIData, wsGradISize);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr);
// provide scaling parameters
const float alpha32(1), beta32(0);
const double alpha64(1), beta64(0);
const void* alpha = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&alpha32) : reinterpret_cast<const void*>(&alpha64);
const void* beta = gradO->sizeOfT() <= 4 ? reinterpret_cast<const void*>(&beta32) : reinterpret_cast<const void*>(&beta64);
NDArray::prepareSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
// run calculation for gradB (if not nullptr)
if(gradB != nullptr) {
cudnnTensorDescriptor_t db;
cudnnCreateTensorDescriptor(&db);
err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err);
err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err);
}
// run calculation for gradW
err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err);
// run calculation for gradI
err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer());
if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardData failed", err);
// cudaErr = cudaStreamSynchronize(*context->getCudaStream());
// if (cudaErr != 0)
// throw cuda_exception::build("depthwiseConv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr);
cudaErr = cudaFree(wsGradWData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr);
cudaErr = cudaFree(wsGradIData);
if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr);
NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO});
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(depthwise_conv2d, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] = iC*mC
auto output = OUTPUT_VARIABLE(0); // [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
REQUIRE_TRUE(input->rankOf() == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) height
int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) width
int sH = INT_ARG(2); // strides height
int sW = INT_ARG(3); // strides width
int pH = INT_ARG(4); // paddings height
int pW = INT_ARG(5); // paddings width
int dH = INT_ARG(6); // dilations height
int dW = INT_ARG(7); // dilations width
int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME
int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
mC = weights->sizeAt(indWmC); // channels multiplier
ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
REQUIRE_TRUE(output->sizeAt(indIOioC) == iC*mC, 0, "DEPTHWISECONV2D CUDNN OP: the output_channels must be equal to input_channels * channels_multiplier = %i !", iC*mC);
if (bias)
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
NDArray* newWeights = new NDArray(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext()); // cudnn support format {oC, iC/groupCount, kH, kW}
newWeights->assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC --> iC, mC, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = nullptr;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
depthwiseConv2dCUDNN(block.launchContext(), newInput, newWeights, bias, output, kH,kW,sH,sW,pH,pW,dH,dW, paddingMode, isNCHW);
if(newInput != input)
delete newInput;
delete newWeights;
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(depthwise_conv2d, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC] = iC*mC
const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL
const int mC = weights->sizeAt(3);
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return mC == 1 && paddingMode != 2 && !badInputType && !badWeightsType && !badBiasType;
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] = [iC*mC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NDHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next
auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW), epsilon
auto gradW = OUTPUT_VARIABLE(1); // [kH, kW, iC, mC] always
auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC]
REQUIRE_TRUE(input->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of input array must be equal to 4, but got %i instead !", input->rankOf());
REQUIRE_TRUE(weights->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of weights array must be equal to 4, but got %i instead !", weights->rankOf());
REQUIRE_TRUE(gradO->rankOf() == 4, 0, "DEPTHWISECONV2D_BP CUDNN OP: rank of output gradients (next epsilon) array must be equal to 4, but got %i instead !", gradO->rankOf());
int kH = INT_ARG(0) > 0 ? INT_ARG(0) : static_cast<int>(weights->sizeAt(0));// filter(kernel) height
int kW = INT_ARG(1) > 0 ? INT_ARG(1) : static_cast<int>(weights->sizeAt(1));// filter(kernel) width
int sH = INT_ARG(2); // strides height
int sW = INT_ARG(3); // strides width
int pH = INT_ARG(4); // paddings height
int pW = INT_ARG(5); // paddings width
int dH = INT_ARG(6); // dilations height
int dW = INT_ARG(7); // dilations width
int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME
int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 1-NHWC, 0-NCHW
int bS, iC, iH, iW, mC, oC, oH, oW; // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH; // corresponding indexes
ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
mC = weights->sizeAt(indWmC); // channels multiplier
int trueoH, trueoW; // correct output height, width
ConvolutionUtils::calcOutSizePool2D(trueoH, trueoW, kH, kW, sH, sW, pH, pW, dH, dW, iH, iW, paddingMode);
ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
std::vector<Nd4jLong> expectedGradOShape = ShapeUtils::composeShapeUsingDimsAndIdx({bS,oC,trueoH,trueoW, 0,indIOioC,indOoH,indOoH+1});
std::vector<Nd4jLong> expectedWeightsShape = {kH, kW, iC, mC};
REQUIRE_TRUE(gradO->isSameShape(expectedGradOShape), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of output gradients (next epsilon) array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedGradOShape).c_str(), ShapeUtils::shapeAsString(gradO).c_str());
REQUIRE_TRUE(weights->isSameShape(expectedWeightsShape), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of weights array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedWeightsShape).c_str(), ShapeUtils::shapeAsString(weights).c_str());
if(bias)
REQUIRE_TRUE(bias->rankOf() <= 2 && oC == bias->lengthOf(), 0, "DEPTHWISECONV2D_BP CUDNN OP: wrong shape of array with biases, expected rank, length: <=2, %i, but got %i, %i instead !", oC, bias->rankOf(), bias->lengthOf());
NDArray* newGradW = new NDArray(gradW->ordering(), {iC, mC, kH, kW}, gradW->dataType(), gradW->getContext()); // cudnn support format {oC, iC/groupCount, kH, kW}
NDArray* newWeights = new NDArray(weights->ordering(), {iC, mC, kH, kW}, weights->dataType(), weights->getContext());
newWeights->assign(weights->permute({2,3,0,1})); // assign permuted weights (kH, kW, iC, mC --> iC, mC, kH, kW)
NDArray* newInput = input;
NDArray* newGradI = gradI;
if(paddingMode == 1) // in same paddingMode cudnn doesn't support asymmetric left/right top/bottopm paddings
checkConv2dCUDNNPadAsymmetric(newInput, newGradI, iH, iW, oH, oW, kH, kW, sH, sW, pH, pW, dH, dW, isNCHW);
depthwiseConv2dBpCUDNN(block.launchContext(), newInput, newWeights, gradO, newGradI, newGradW, gradB, kH,kW,sH,sW,pH,pW,dH,dW,paddingMode,isNCHW);
newGradW->permutei({2,3,0,1}); // [iC, mC, kH, kW] -> [kH, kW, iC, mC]
gradW->assign(newGradW);
if(newInput != input) {
if(isNCHW)
gradI->assign((*newGradI)({0,0, 0,0, 0,gradI->sizeAt(2), 0,gradI->sizeAt(3)}));
else
gradI->assign((*newGradI)({0,0, 0,gradI->sizeAt(1), 0,gradI->sizeAt(2), 0,0}));
delete newInput;
delete newGradI;
}
delete newWeights;
delete newGradW;
return Status::OK();
}
PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CUDA) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC] = [iC*mC]
auto gradO = block.width() > 3 ? INPUT_VARIABLE(3) : INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NDHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next
const int paddingMode = INT_ARG(8); // 0-VALID, 1-SAME, 2-CAUSAL
const int isNCHW = block.getIArguments()->size() > 9 ? !INT_ARG(9) : 1; // INT_ARG(9): 0-NCHW, 1-NHWC
const int mC = weights->sizeAt(3);
const bool badInputType = input->dataType() != DataType::DOUBLE && input->dataType() != DataType::FLOAT32 && input->dataType() != DataType::HALF;
const bool badWeightsType = weights->dataType() != DataType::DOUBLE && weights->dataType() != DataType::FLOAT32 && weights->dataType() != DataType::HALF;
const bool badGradOType = gradO->dataType() != DataType::DOUBLE && gradO->dataType() != DataType::FLOAT32 && gradO->dataType() != DataType::HALF;
const bool badBiasType = bias == nullptr ? false : (bias->dataType() != DataType::DOUBLE && bias->dataType() != DataType::FLOAT32 && bias->dataType() != DataType::HALF);
return mC == 1 && isNCHW && paddingMode != 2 && !badInputType && !badWeightsType && !badGradOType && !badBiasType;
}
}
}
}

View File

@ -28,11 +28,12 @@
#include <ops/declarable/helpers/convolutions.h>
using namespace dnnl;
using namespace samediff;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(avgpool2d) {
PLATFORM_IMPL(avgpool2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead",
@ -128,7 +129,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(avgpool2d) {
PLATFORM_CHECK(avgpool2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -32,7 +32,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(avgpool2d_bp) {
PLATFORM_IMPL(avgpool2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto gradO = INPUT_VARIABLE(
@ -138,7 +138,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(avgpool2d_bp) {
PLATFORM_CHECK(avgpool2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -32,7 +32,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(avgpool3dnew) {
PLATFORM_IMPL(avgpool3dnew, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto output = OUTPUT_VARIABLE(
@ -130,7 +130,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(avgpool3dnew) {
PLATFORM_CHECK(avgpool3dnew, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -31,7 +31,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(avgpool3dnew_bp) {
PLATFORM_IMPL(avgpool3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto gradO = INPUT_VARIABLE(
@ -143,7 +143,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(avgpool3dnew_bp) {
PLATFORM_CHECK(avgpool3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -375,7 +375,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const
*dLdI += xMinusMean;
}
PLATFORM_IMPL(batchnorm) {
PLATFORM_IMPL(batchnorm, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw
auto mean = INPUT_VARIABLE(1); // [c]
@ -455,7 +455,7 @@ PLATFORM_IMPL(batchnorm) {
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(batchnorm) {
PLATFORM_CHECK(batchnorm, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
// if (::optimalLevel() < 2)
// return false;
@ -632,7 +632,7 @@ PLATFORM_CHECK(batchnorm) {
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(batchnorm_bp) {
PLATFORM_IMPL(batchnorm_bp, ENGINE_CPU) {
NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw
NDArray* mean = INPUT_VARIABLE(1); // [c]
@ -735,7 +735,7 @@ PLATFORM_IMPL(batchnorm_bp) {
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(batchnorm_bp) {
PLATFORM_CHECK(batchnorm_bp, ENGINE_CPU) {
NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw
NDArray* mean = INPUT_VARIABLE(1); // [c]
NDArray* variance = INPUT_VARIABLE(2); // [c]

View File

@ -113,7 +113,7 @@ static void conv2d_mkldnn(nd4j::graph::Context &block, const NDArray *input, con
}
//////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv2d) {
PLATFORM_IMPL(conv2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 2 ? INPUT_VARIABLE(2) : nullptr; // [oC]
@ -137,7 +137,7 @@ PLATFORM_IMPL(conv2d) {
return Status::OK();
}
PLATFORM_CHECK(conv2d) {
PLATFORM_CHECK(conv2d, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
if (::optimalLevel() < 2)
return false;
@ -151,7 +151,7 @@ PLATFORM_CHECK(conv2d) {
}
//////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv2d_bp) {
PLATFORM_IMPL(conv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]
@ -328,7 +328,7 @@ PLATFORM_IMPL(conv2d_bp) {
return Status::OK();
}
PLATFORM_CHECK(conv2d_bp) {
PLATFORM_CHECK(conv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]

View File

@ -34,7 +34,7 @@ namespace ops {
namespace platforms {
//////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv3dnew) {
PLATFORM_IMPL(conv3dnew, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, iC, oC] always
@ -150,7 +150,7 @@ PLATFORM_IMPL(conv3dnew) {
return Status::OK();
}
PLATFORM_CHECK(conv3dnew) {
PLATFORM_CHECK(conv3dnew, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
if (::optimalLevel() < 2)
return false;
@ -167,7 +167,7 @@ PLATFORM_CHECK(conv3dnew) {
//////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(conv3dnew_bp) {
PLATFORM_IMPL(conv3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(
@ -374,7 +374,7 @@ PLATFORM_IMPL(conv3dnew_bp) {
return Status::OK();
}
PLATFORM_CHECK(conv3dnew_bp) {
PLATFORM_CHECK(conv3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(

View File

@ -349,7 +349,7 @@ static void deconv2dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(deconv2d) {
PLATFORM_IMPL(deconv2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always
@ -406,7 +406,7 @@ PLATFORM_IMPL(deconv2d) {
return Status::OK();
}
PLATFORM_CHECK(deconv2d) {
PLATFORM_CHECK(deconv2d, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
// if (::optimalLevel() < 2)
// return false;
@ -435,7 +435,7 @@ PLATFORM_CHECK(deconv2d) {
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(deconv2d_bp) {
PLATFORM_IMPL(deconv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always
@ -506,7 +506,7 @@ PLATFORM_IMPL(deconv2d_bp) {
return Status::OK();
}
PLATFORM_CHECK(deconv2d_bp) {
PLATFORM_CHECK(deconv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, oC, iC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]

View File

@ -145,7 +145,7 @@ static void deconv2TFdBackPropMKLDNN(const NDArray* weights, const NDArray* grad
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(deconv2d_tf) {
PLATFORM_IMPL(deconv2d_tf, ENGINE_CPU) {
auto gradO = INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
@ -222,7 +222,7 @@ PLATFORM_IMPL(deconv2d_tf) {
return Status::OK();
}
PLATFORM_CHECK(deconv2d_tf) {
PLATFORM_CHECK(deconv2d_tf, ENGINE_CPU) {
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always
auto gradO = INPUT_VARIABLE(2); // [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCDHW), epsilon_next
auto gradI = OUTPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCDHW), gradI

View File

@ -360,7 +360,7 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights,
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(deconv3d) {
PLATFORM_IMPL(deconv3d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always
@ -421,7 +421,7 @@ PLATFORM_IMPL(deconv3d) {
return Status::OK();
}
PLATFORM_CHECK(deconv3d) {
PLATFORM_CHECK(deconv3d, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
// if (::optimalLevel() < 2)
// return false;
@ -451,7 +451,7 @@ PLATFORM_CHECK(deconv3d) {
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(deconv3d_bp) {
PLATFORM_IMPL(deconv3d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always
@ -525,7 +525,7 @@ PLATFORM_IMPL(deconv3d_bp) {
}
PLATFORM_CHECK(deconv3d_bp) {
PLATFORM_CHECK(deconv3d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iD, iH, iW, iC] (NHWC) or [bS, iD, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kD, kH, kW, oC, iC] always
auto bias = block.width() > 3 ? INPUT_VARIABLE(2) : nullptr; // [oC]

View File

@ -362,7 +362,7 @@ static void depthwiseConv2dNackPropMKLDNN(const NDArray* input, const NDArray* w
//////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(depthwise_conv2d) {
PLATFORM_IMPL(depthwise_conv2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
@ -400,7 +400,7 @@ PLATFORM_IMPL(depthwise_conv2d) {
}
//////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(depthwise_conv2d) {
PLATFORM_CHECK(depthwise_conv2d, ENGINE_CPU) {
// we don't want to use mkldnn if cpu doesn't support avx/avx2
if (::optimalLevel() < 2)
return false;
@ -427,7 +427,7 @@ PLATFORM_CHECK(depthwise_conv2d) {
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(depthwise_conv2d_bp) {
PLATFORM_IMPL(depthwise_conv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always
@ -476,7 +476,7 @@ PLATFORM_IMPL(depthwise_conv2d_bp) {
}
//////////////////////////////////////////////////////////////////////
PLATFORM_CHECK(depthwise_conv2d_bp) {
PLATFORM_CHECK(depthwise_conv2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NDHWC) or [bS, iC, iH, iW] (NCDHW)
auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, mC] always

View File

@ -32,7 +32,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(lrn) {
PLATFORM_IMPL(lrn, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);
@ -82,7 +82,7 @@ namespace nd4j {
return Status::OK();
};
PLATFORM_CHECK(lrn) {
PLATFORM_CHECK(lrn, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -365,7 +365,7 @@ static void lstmLayerMKLDNN(const NDArray* x, const NDArray* Wx, const NDArray*
}
//////////////////////////////////////////////////////////////////////////
PLATFORM_IMPL(lstmLayer) {
PLATFORM_IMPL(lstmLayer, ENGINE_CPU) {
const auto dataFormat = INT_ARG(0); // for unidirectional: 0 = [sL, bS, nIn], 1 = [bS, sL ,nIn], 2 = [bS, nIn, sL], for bidirectional: 3 = [sL, 2, bS, nOut] (for ONNX)
const auto directionMode = INT_ARG(1); // direction: 0 = fwd, 1 = bwd, 2 = bidirectional sum, 3 = bidirectional concat, 4 = bidirectional extra output dim (in conjunction with format dataFormat = 3)
@ -493,7 +493,7 @@ PLATFORM_IMPL(lstmLayer) {
return Status::OK();
}
PLATFORM_CHECK(lstmLayer) {
PLATFORM_CHECK(lstmLayer, ENGINE_CPU) {
const auto hasBiases = B_ARG(0); // indicates whether biases array is provided
const auto hasInitH = B_ARG(2); // indicates whether initial output is provided
const auto hasInitC = B_ARG(3); // indicates whether initial cell state is provided

View File

@ -32,7 +32,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(maxpool2d) {
PLATFORM_IMPL(maxpool2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
REQUIRE_TRUE(input->rankOf() == 4, 0, "Input should have rank of 4, but got %i instead",
@ -134,7 +134,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(maxpool2d) {
PLATFORM_CHECK(maxpool2d, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -32,7 +32,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(maxpool2d_bp) {
PLATFORM_IMPL(maxpool2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
auto gradO = INPUT_VARIABLE(
@ -163,7 +163,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(maxpool2d_bp) {
PLATFORM_CHECK(maxpool2d_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -31,7 +31,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(maxpool3dnew) {
PLATFORM_IMPL(maxpool3dnew, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto output = OUTPUT_VARIABLE(
@ -140,7 +140,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(maxpool3dnew) {
PLATFORM_CHECK(maxpool3dnew, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -31,7 +31,7 @@ using namespace dnnl;
namespace nd4j {
namespace ops {
namespace platforms {
PLATFORM_IMPL(maxpool3dnew_bp) {
PLATFORM_IMPL(maxpool3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(
0); // [bS, iD, iH, iW, iC] (NDHWC) or [bS, iC, iD, iH, iW] (NCDHW)
auto gradO = INPUT_VARIABLE(
@ -170,7 +170,7 @@ namespace nd4j {
return Status::OK();
}
PLATFORM_CHECK(maxpool3dnew_bp) {
PLATFORM_CHECK(maxpool3dnew_bp, ENGINE_CPU) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);

View File

@ -29,6 +29,8 @@
#include <ops/declarable/PlatformHelper.h>
#include <platform_boilerplate.h>
using namespace samediff;
namespace nd4j{
namespace ops {
@ -36,50 +38,51 @@ namespace nd4j{
/**
* Here we actually declare our platform helpers
*/
DECLARE_PLATFORM(conv2d);
DECLARE_PLATFORM(conv2d, ENGINE_CPU);
DECLARE_PLATFORM(conv2d_bp);
DECLARE_PLATFORM(conv2d_bp, ENGINE_CPU);
DECLARE_PLATFORM(avgpool2d);
DECLARE_PLATFORM(avgpool2d, ENGINE_CPU);
DECLARE_PLATFORM(avgpool2d_bp);
DECLARE_PLATFORM(avgpool2d_bp, ENGINE_CPU);
DECLARE_PLATFORM(maxpool2d);
DECLARE_PLATFORM(maxpool2d, ENGINE_CPU);
DECLARE_PLATFORM(maxpool2d_bp);
DECLARE_PLATFORM(maxpool2d_bp, ENGINE_CPU);
DECLARE_PLATFORM(conv3dnew);
DECLARE_PLATFORM(conv3dnew, ENGINE_CPU);
DECLARE_PLATFORM(conv3dnew_bp);
DECLARE_PLATFORM(conv3dnew_bp, ENGINE_CPU);
DECLARE_PLATFORM(maxpool3dnew);
DECLARE_PLATFORM(maxpool3dnew, ENGINE_CPU);
DECLARE_PLATFORM(maxpool3dnew_bp);
DECLARE_PLATFORM(maxpool3dnew_bp, ENGINE_CPU);
DECLARE_PLATFORM(avgpool3dnew);
DECLARE_PLATFORM(avgpool3dnew, ENGINE_CPU);
DECLARE_PLATFORM(avgpool3dnew_bp);
DECLARE_PLATFORM(avgpool3dnew_bp, ENGINE_CPU);
DECLARE_PLATFORM(lrn);
DECLARE_PLATFORM(lrn, ENGINE_CPU);
DECLARE_PLATFORM(batchnorm);
DECLARE_PLATFORM(batchnorm, ENGINE_CPU);
DECLARE_PLATFORM(batchnorm_bp);
DECLARE_PLATFORM(batchnorm_bp, ENGINE_CPU);
DECLARE_PLATFORM(lstmLayer);
DECLARE_PLATFORM(lstmLayer, ENGINE_CPU);
DECLARE_PLATFORM(deconv2d);
DECLARE_PLATFORM(deconv2d, ENGINE_CPU);
DECLARE_PLATFORM(deconv2d_tf);
DECLARE_PLATFORM(deconv2d_tf, ENGINE_CPU);
DECLARE_PLATFORM(deconv3d);
DECLARE_PLATFORM(deconv3d, ENGINE_CPU);
DECLARE_PLATFORM(deconv2d_bp);
DECLARE_PLATFORM(deconv2d_bp, ENGINE_CPU);
DECLARE_PLATFORM(deconv3d_bp);
DECLARE_PLATFORM(deconv3d_bp, ENGINE_CPU);
DECLARE_PLATFORM(depthwise_conv2d);
DECLARE_PLATFORM(depthwise_conv2d_bp);
DECLARE_PLATFORM(depthwise_conv2d, ENGINE_CPU);
DECLARE_PLATFORM(depthwise_conv2d_bp, ENGINE_CPU);
}
}

View File

@ -21,25 +21,37 @@
#ifndef SD_PLATFORM_BOILERPLATE_H
#define SD_PLATFORM_BOILERPLATE_H
#include <execution/Engine.h>
#define DECLARE_PLATFORM(NAME) class ND4J_EXPORT PLATFORM_##NAME : public PlatformHelper {\
#define CONCATP(A,B) A ##_##B
#define DECLARE_PLATFORM_F(NAME, ENGINE, CNAME) class ND4J_EXPORT PLATFORM_##CNAME : public PlatformHelper {\
public: \
PLATFORM_##NAME() : PlatformHelper(#NAME) { } \
PLATFORM_##CNAME() : PlatformHelper(#NAME, samediff::Engine::ENGINE) { } \
bool isUsable(graph::Context &context) override; \
Nd4jStatus invokeHelper(graph::Context &context) override; \
};
#define PLATFORM_IMPL(NAME) struct ND4J_EXPORT __registratorPlatformHelper_##NAME { \
__registratorPlatformHelper_##NAME() { \
auto helper = new PLATFORM_##NAME(); \
#define DECLARE_PLATFORM(NAME, ENGINE) DECLARE_PLATFORM_F(NAME, ENGINE, NAME ##_## ENGINE)
#define PLATFORM_IMPL_F(NAME, ENGINE, CNAME) struct ND4J_EXPORT __registratorPlatformHelper_##CNAME { \
__registratorPlatformHelper_##CNAME() { \
auto helper = new PLATFORM_##CNAME(); \
OpRegistrator::getInstance()->registerHelper(helper); \
} \
}; \
static __registratorPlatformHelper_##NAME platformHelper_##NAME; \
Nd4jStatus PLATFORM_##NAME::invokeHelper(nd4j::graph::Context &block)
static __registratorPlatformHelper_##CNAME platformHelper_##CNAME; \
Nd4jStatus PLATFORM_##CNAME::invokeHelper(nd4j::graph::Context &block)
#define PLATFORM_CHECK(NAME) bool PLATFORM_##NAME::isUsable(graph::Context &block)
#define PLATFORM_IMPL(NAME, ENGINE) PLATFORM_IMPL_F(NAME, ENGINE, NAME ##_## ENGINE)
#define PLATFORM_CHECK_F(NAME, ENGINE, CNAME) bool PLATFORM_##CNAME::isUsable(graph::Context &block)
#define PLATFORM_CHECK(NAME, ENGINE) PLATFORM_CHECK_F(NAME, ENGINE, NAME ##_## ENGINE)
#endif //SD_PLATFORM_BOILERPLATE_H

View File

@ -21,8 +21,9 @@
#ifndef LIBND4J_PLAY_H
#define LIBND4J_PLAY_H
#include <type_boilerplate.h>
//#include <type_boilerplate.h>
#include <platform_boilerplate.h>
/*
#define DATA_TYPES \
(DATA_FLOAT, float) ,\
(DATA_DOUBLE, double)
@ -41,6 +42,9 @@
BUILD_SINGLE_TEMPLATE_TWICE(template class functionName, , DATA_TYPES)
*/
DECLARE_PLATFORM(conv2d, ENGINE_CPU)
//BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functionName, (signature), DATA_TYPES, Y_TYPES);

View File

@ -135,12 +135,18 @@ elseif(CUDA_BLAS)
add_executable(runtests ${TEST_SOURCES})
if (WIN32)
message("MSVC runtime for tests: ${MSVC_RT_LIB}")
endif()
# applies to windows only
set_property(TARGET runtests PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
set_property(TARGET gtest PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
set_property(TARGET gtest_main PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} gtest gtest_main)
if (HAVE_CUDNN)
message("CUDNN library: ${CUDNN}")
endif()
target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN} gtest gtest_main)
endif()

File diff suppressed because one or more lines are too long

View File

@ -2416,7 +2416,7 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_9) {
ASSERT_EQ(Status::OK(), results->status());
ASSERT_TRUE(expOutput.isSameShape(output));
ASSERT_TRUE(expOutput.equalsTo(output));
ASSERT_TRUE(expOutput.equalsTo(output, 1e-4));
delete results;
}

View File

@ -0,0 +1,128 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include "testlayers.h"
#include <initializer_list>
#include <NDArrayFactory.h>
#include <ops/declarable/PlatformHelper.h>
#include <ops/declarable/CustomOperations.h>
#include <execution/Engine.h>
#ifdef HAVE_CUDNN
#include <ops/declarable/platform/cudnn/cudnnUtils.h>
#endif
using namespace nd4j;
class CuDnnTests : public testing::Test {
public:
};
static void printer(std::initializer_list<nd4j::ops::platforms::PlatformHelper*> helpers) {
for (auto v:helpers) {
nd4j_printf("Initialized [%s]\n", v->name().c_str());
}
}
TEST_F(CuDnnTests, helpers_includer) {
// we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
#ifdef HAVE_CUDNN
nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CUDA conv2d;
nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CUDA conv2d_bp;
nd4j::ops::platforms::PLATFORM_conv3dnew_ENGINE_CUDA conv3dnew;
nd4j::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CUDA conv3dnew_bp;
nd4j::ops::platforms::PLATFORM_depthwise_conv2d_ENGINE_CUDA depthwise_conv2d;
nd4j::ops::platforms::PLATFORM_depthwise_conv2d_bp_ENGINE_CUDA depthwise_conv2d_bp;
nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CUDA batchnorm;
printer({&conv2d});
printer({&conv2d_bp});
printer({&conv3dnew});
printer({&conv3dnew_bp});
printer({&depthwise_conv2d});
printer({&depthwise_conv2d_bp});
printer({&batchnorm});
#endif
}
TEST_F(CuDnnTests, mixed_helpers_test_1) {
#if defined(HAVE_CUDNN) && defined (HAVE_MKLDNN)
nd4j_printf("Mixed platforms test\n", "");
int bS=2, iH=4,iW=3, iC=4,oC=3, kH=3,kW=2, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1;
int oH=2,oW=2;
int paddingMode = 0; // 1-SAME, 0-VALID;
int dataFormat = 0; // 1-NHWC, 0-NCHW
auto input = NDArrayFactory::create<float>('c', {bS, iC, iH, iW});
auto weights = NDArrayFactory::create<float>('c', {oC, iC, kH, kW});
auto bias = NDArrayFactory::create<float>('c', {oC}, {1,2,3});
auto expOutput = NDArrayFactory::create<float>('c', {bS, oC, oH, oW}, {61.f, 61.f, 61.f, 61.f, 177.2f, 177.2f, 177.2f, 177.2f, 293.4f, 293.4f, 293.4f, 293.4f, 61.f, 61.f, 61.f, 61.f, 177.2f, 177.2f, 177.2f, 177.2f, 293.4f, 293.4f, 293.4f, 293.4f});
auto zCUDA = expOutput.like();
auto zMKL = expOutput.like();
input = 2.;
weights.linspace(0.1, 0.1);
weights.permutei({2,3,1,0});
input.syncToHost();
weights.syncToHost();
bias.syncToHost();
nd4j::ops::conv2d op;
// cuDNN part
Context cuda(1);
cuda.setTargetEngine(samediff::Engine::ENGINE_CUDA);
cuda.setInputArray(0, &input);
cuda.setInputArray(1, &weights);
cuda.setInputArray(2, &bias);
cuda.setOutputArray(0, &zCUDA);
cuda.setIArguments({kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat});
auto statusCUDA = op.execute(&cuda);
ASSERT_EQ(Status::OK(), statusCUDA);
ASSERT_EQ(expOutput, zCUDA);
// MKL-DNN part
Context mkl(1);
mkl.setTargetEngine(samediff::Engine::ENGINE_CPU);
mkl.setInputArray(0, &input);
mkl.setInputArray(1, &weights);
mkl.setInputArray(2, &bias);
mkl.setOutputArray(0, &zMKL);
mkl.setIArguments({kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat});
auto statusMKL = op.execute(&mkl);
zMKL.tickWriteHost();
ASSERT_EQ(Status::OK(), statusMKL);
ASSERT_EQ(expOutput, zMKL);
#endif
}

View File

@ -3280,209 +3280,6 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) {
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests10, batchnorm_test1) {
NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test2) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {4});
auto variance = NDArrayFactory::create<TypeParam>('c', {4});
auto gamma = NDArrayFactory::create<TypeParam>('c', {4});
auto beta = NDArrayFactory::create<TypeParam>('c', {4});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f,
1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f});
input.linspace(0.1, 0.1);
mean.assign(1.);
variance.assign(0.5);
gamma.assign(1.2);
beta.assign(1.);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test3) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {3}, {1.05f, 1.1f, 1.15f});
auto variance = NDArrayFactory::create<TypeParam>('c', {3}, {0.5f, 0.6f, 0.7f});
auto gamma = NDArrayFactory::create<TypeParam>('c', {3}, {1.2f, 1.3f, 1.4f});
auto beta = NDArrayFactory::create<TypeParam>('c', {3}, {0.1f, 0.2f, 0.3f});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f,
0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f});
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_test4) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f});
auto variance = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f});
auto gamma = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f});
auto beta = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f,
0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f});
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests10, batchnorm_test5) {
NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,4,2,2}, { 11.612187f, 11.442483f, 11.272779f, 11.103076f, 18.990039f, 19.145418f, 19.300796f, 19.456175f, -9.557284f, -9.704856f, -9.852428f, -10.f, -20.f,
-19.856981f, -19.713963f, -19.570944f, 8.896924f, 8.727221f, 8.557517f, 8.387813f, 21.476097f, 21.631475f, 21.786854f, 21.942233f, -11.918438f,
-12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests10, batchnorm_test6) {
NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,2,2,4}, {11.612187f, 18.523903f, -8.671851f, -21.287169f, 10.933372f, 19.145418f, -9.262139f, -20.715094f, 10.254556f, 19.766932f, -9.852428f, -20.143019f, 9.57574f,
20.388447f, -10.442716f, -19.570944f, 8.896924f, 21.009961f, -11.033005f, -18.998869f, 8.218109f, 21.631475f, -11.623294f, -18.426794f, 7.539293f, 22.25299f,
-12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests10, batchnorm_test7) {
NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32);
input2.permutei({0,3,1,2});
NDArray mean ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
input1.linspace(-1012, 1);
input2.assign(input1);
nd4j::ops::batchnorm op;
auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {});
ASSERT_EQ(ND4J_STATUS_OK, res1);
auto res2 = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {});
ASSERT_EQ(ND4J_STATUS_OK, res2);
ASSERT_TRUE(out1.equalsTo(out2));
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests10, bool_broadcast_test_1) {

View File

@ -38,6 +38,19 @@ public:
}
};
template <typename T>
class TypedDeclarableOpsTests13 : public testing::Test {
public:
TypedDeclarableOpsTests13() {
printf("\n");
fflush(stdout);
}
};
typedef ::testing::Types<double, float> TestingTypes;
TYPED_TEST_CASE(TypedDeclarableOpsTests13, TestingTypes);
TEST_F(DeclarableOpsTests13, test_pow_1) {
auto x = NDArrayFactory::create<float>('c', {2, 2}, {2.f, 2.f, 2.f, 2.f});
auto y = NDArrayFactory::create<int>('c', {2}, {3, 3});
@ -1948,3 +1961,289 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) {
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test1) {
NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test2) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {4});
auto variance = NDArrayFactory::create<TypeParam>('c', {4});
auto gamma = NDArrayFactory::create<TypeParam>('c', {4});
auto beta = NDArrayFactory::create<TypeParam>('c', {4});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-0.52733537f, -0.35763144f, -0.18792751f, -0.01822358f, 0.15148035f, 0.32118428f, 0.49088821f, 0.66059214f, 0.83029607f, 1.f, 1.16970393f, 1.33940786f,
1.50911179f, 1.67881572f, 1.84851965f, 2.01822358f, 2.18792751f, 2.35763144f, 2.52733537f, 2.6970393f, 2.86674323f, 3.03644717f, 3.2061511f, 3.37585503f});
input.linspace(0.1, 0.1);
mean.assign(1.);
variance.assign(0.5);
gamma.assign(1.2);
beta.assign(1.);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test3) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {3}, {1.05f, 1.1f, 1.15f});
auto variance = NDArrayFactory::create<TypeParam>('c', {3}, {0.5f, 0.6f, 0.7f});
auto gamma = NDArrayFactory::create<TypeParam>('c', {3}, {1.2f, 1.3f, 1.4f});
auto beta = NDArrayFactory::create<TypeParam>('c', {3}, {0.1f, 0.2f, 0.3f});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.34248341f, -1.17277948f, -1.00307555f, -0.80696728f, -0.6391394f, -0.47131152f, -0.30348364f, -0.11832703f, 0.04900378f, 0.21633459f, 0.38366541f,
0.52425983f, 0.69396376f, 0.86366769f, 1.03337162f, 1.20696728f, 1.37479516f, 1.54262304f, 1.71045092f, 1.8896427f, 2.05697351f, 2.22430432f, 2.39163513f});
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test4) {
auto input = NDArrayFactory::create<TypeParam>('c', {2,3,4});
auto mean = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.05f, 1.1f, 1.15f, 1.2f, 1.25f, 1.3f, 1.35f, 1.4f});
auto variance = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.f, 1.1f, 1.2f});
auto gamma = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, 1.8f, 1.9f});
auto beta = NDArrayFactory::create<TypeParam>('c', {2,1,4}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.66f, 0.7f, 0.8f});
auto expected = NDArrayFactory::create<TypeParam>('c', {2,3,4}, {-1.51218734f, -1.31045092f, -1.12231189f, -0.9416324f, -0.83337162f, -0.6391394f, -0.45298865f, -0.2708162f, -0.1545559f, 0.03217212f, 0.21633459f, 0.4f,
0.58432694f, 0.82999915f, 0.95743373f, 1.14688951f, 1.25894242f, 1.50999575f, 1.64392367f, 1.84066852f, 1.93355791f, 2.18999235f, 2.33041362f, 2.53444754f});
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test5) {
NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,4,2,2}, { 11.612187f, 11.442483f, 11.272779f, 11.103076f, 18.990039f, 19.145418f, 19.300796f, 19.456175f, -9.557284f, -9.704856f, -9.852428f, -10.f, -20.f,
-19.856981f, -19.713963f, -19.570944f, 8.896924f, 8.727221f, 8.557517f, 8.387813f, 21.476097f, 21.631475f, 21.786854f, 21.942233f, -11.918438f,
-12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test6) {
NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32);
NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,2,2,4}, {11.612187f, 18.523903f, -8.671851f, -21.287169f, 10.933372f, 19.145418f, -9.262139f, -20.715094f, 10.254556f, 19.766932f, -9.852428f, -20.143019f, 9.57574f,
20.388447f, -10.442716f, -19.570944f, 8.896924f, 21.009961f, -11.033005f, -18.998869f, 8.218109f, 21.631475f, -11.623294f, -18.426794f, 7.539293f, 22.25299f,
-12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32);
input.linspace(0.1, 0.1);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShapeStrict(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test7) {
NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32);
input2.permutei({0,3,1,2});
NDArray mean ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
NDArray variance('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {3}, {1, 1, 1}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {3}, {0, 0, 0}, nd4j::DataType::FLOAT32);
NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32);
input1.linspace(-1012, 1);
input2.assign(input1);
nd4j::ops::batchnorm op;
auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {});
ASSERT_EQ(ND4J_STATUS_OK, res1);
auto res2 = op.execute({&input2, &mean, &variance, &gamma, &beta}, {&out2}, {1e-5}, {1,1,1}, {});
ASSERT_EQ(ND4J_STATUS_OK, res2);
ASSERT_TRUE(out1.equalsTo(out2));
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test8) {
NDArray input('c', {2,3,4,5}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
NDArray variance('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {1,3,4,5}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,3,4,5}, {-105.019394, -103.322357, -101.625313, -99.928276, -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922,
-82.957886, -81.260841, -79.563805, -77.866768, -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414,
-60.896374, -59.199333, -57.502296, -55.805256, -54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902,
-38.834862, -37.137825, -35.440784, -33.743744, -32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392,
-16.773354, -15.076314, -13.379274, -11.682236, -9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157,
6.985196, 8.682236, 10.379274, 12.076314, 13.773354, 15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744,
32.440784, 34.137825, 35.834862, 37.531902, 39.228943, 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333,
57.896374, 59.593414, 61.290451, 62.987488, 64.684532, 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922,
83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276}, nd4j::DataType::FLOAT32);
input.linspace(-60, 1);
mean.assign(1.);
variance.assign(0.5);
gamma.assign(1.2);
beta.assign(-1.5);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
ASSERT_TRUE(expected.isSameShape(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}
////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batchnorm_test9) {
NDArray input('c', {2,3,3,3,3}, nd4j::DataType::FLOAT32);
NDArray mean ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
NDArray variance('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
NDArray gamma ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
NDArray beta ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32);
NDArray expected('c', {2,3,3,3,3}, {-138.960175, -137.263138, -135.566101, -133.869064, -132.172028, -130.474976, -128.777954, -127.080902, -125.383865, -123.686829, -121.989784, -120.292747,
-118.595711, -116.898666, -115.201630, -113.504593, -111.807549, -110.110512, -108.413475, -106.716431, -105.019394, -103.322357, -101.625313, -99.928276,
-98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922, -82.957886, -81.260841, -79.563805, -77.866768,
-76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414, -60.896374, -59.199333, -57.502296, -55.805256,
-54.108215, -52.411179, -50.714138, -49.017097, -47.320061, -45.623020, -43.925980, -42.228943, -40.531902, -38.834862, -37.137825, -35.440784, -33.743744,
-32.046707, -30.349667, -28.652628, -26.955589, -25.258549, -23.561510, -21.864471, -20.167431, -18.470392, -16.773354, -15.076314, -13.379274, -11.682236,
-9.985196, -8.288157, -6.591118, -4.894078, -3.197039, -1.500000, 0.197039, 1.894078, 3.591118, 5.288157, 6.985196, 8.682236, 10.379274, 12.076314, 13.773354,
15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744, 32.440784, 34.137825, 35.834862, 37.531902, 39.228943,
40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, 57.896374, 59.593414, 61.290451, 62.987488, 64.684532,
66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922, 83.351959, 85.049004, 86.746040, 88.443077, 90.140121,
91.837158, 93.534195, 95.231239, 96.928276, 98.625313, 100.322357, 102.019394, 103.716431, 105.413475, 107.110512, 108.807549, 110.504593, 112.201630, 113.898666,
115.595711, 117.292747, 118.989784, 120.686829, 122.383865, 124.080902, 125.777946, 127.474976, 129.172028, 130.869064, 132.566101, 134.263138}, nd4j::DataType::FLOAT32);
input.linspace(-80, 1);
mean.assign(1.);
variance.assign(0.5);
gamma.assign(1.2);
beta.assign(-1.5);
nd4j::ops::batchnorm op;
auto results = op.execute({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3,4});
ASSERT_EQ(ND4J_STATUS_OK, results->status());
auto output = results->at(0);
// output->printBuffer();
ASSERT_TRUE(expected.isSameShape(*output));
ASSERT_TRUE(expected.equalsTo(output));
delete results;
}

View File

@ -45,26 +45,26 @@ static void printer(std::initializer_list<nd4j::ops::platforms::PlatformHelper*>
TEST_F(MklDnnTests, helpers_includer) {
// we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
#ifdef HAVE_MKLDNN
nd4j::ops::platforms::PLATFORM_conv2d conv2d;
nd4j::ops::platforms::PLATFORM_conv2d_bp conv2d_bp;
nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d;
nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp;
nd4j::ops::platforms::PLATFORM_conv2d conv3d;
nd4j::ops::platforms::PLATFORM_conv2d_bp conv3d_bp;
nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv3d;
nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv3d_bp;
nd4j::ops::platforms::PLATFORM_avgpool2d avgpool2d;
nd4j::ops::platforms::PLATFORM_avgpool2d_bp avgpool2d_bp;
nd4j::ops::platforms::PLATFORM_avgpool2d_ENGINE_CPU avgpool2d;
nd4j::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CPU avgpool2d_bp;
nd4j::ops::platforms::PLATFORM_maxpool2d maxpool2d;
nd4j::ops::platforms::PLATFORM_maxpool2d_bp maxpool2d_bp;
nd4j::ops::platforms::PLATFORM_maxpool2d_ENGINE_CPU maxpool2d;
nd4j::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CPU maxpool2d_bp;
nd4j::ops::platforms::PLATFORM_avgpool3dnew avgpool3d;
nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp avgpool3d_bp;
nd4j::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CPU avgpool3d;
nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CPU avgpool3d_bp;
nd4j::ops::platforms::PLATFORM_maxpool3dnew maxpool3d;
nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp maxpool3d_bp;
nd4j::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CPU maxpool3d;
nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp;
nd4j::ops::platforms::PLATFORM_lrn lrn;
nd4j::ops::platforms::PLATFORM_batchnorm batchnorm;
nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn;
nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm;
printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm});
#endif

View File

@ -247,8 +247,10 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) {
auto res = cudaStreamSynchronize(*stream);
ASSERT_EQ(0, res);
//double* localBuffer = ;
z.syncToHost();
cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost);
res = cudaStreamSynchronize(*stream);
z.tickWriteHost();
ASSERT_EQ(0, res);
//

View File

@ -150,7 +150,7 @@ if ("${EXPERIMENTAL}" STREQUAL "yes")
endif()
# tests are always compiled with all ops included
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true")
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DDEFAULT_ENGINE=samediff::ENGINE_CPU")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# using Clang

View File

@ -38,6 +38,7 @@ import org.bytedeco.javacpp.tools.InfoMapper;
"array/ConstantDataBuffer.h",
"array/TadPack.h",
"execution/ErrorReference.h",
"execution/Engine.h",
"memory/MemoryType.h",
"Environment.h",
"types/utf8string.h",

View File

@ -41,6 +41,7 @@ import java.util.Scanner;
"array/ConstantDescriptor.h",
"array/TadPack.h",
"execution/ErrorReference.h",
"execution/Engine.h",
"Environment.h",
"types/utf8string.h",
"NativeOps.h",