[WIP] CUDA build (#109)

* working prototype of new CUDA build with cmake

Signed-off-by: raver119 <raver119@gmail.com>

* get rid of older stuff

Signed-off-by: raver119 <raver119@gmail.com>

* remove legacy CUDA debug section

Signed-off-by: raver119 <raver119@gmail.com>

* fPIC for GCC

Signed-off-by: raver119 <raver119@gmail.com>

* - switch to /MD
- make MSVC runtime lib configurable from 1 place

Signed-off-by: raver119 <raver119@gmail.com>

* few last tweaks

Signed-off-by: raver119 <raver119@gmail.com>

* make static library optional

Signed-off-by: raver119 <raver119@gmail.com>

* typo fixed

Signed-off-by: raver119 <raver119@gmail.com>
master
raver119 2019-12-14 14:38:17 +03:00 committed by GitHub
parent f78a638c9a
commit fdda0221ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 105 additions and 141 deletions

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.6) cmake_minimum_required(VERSION 3.15)
project(libnd4j) project(libnd4j)
set(CMAKE_VERBOSE_MAKEFILE OFF) set(CMAKE_VERBOSE_MAKEFILE OFF)
option(NATIVE "Optimize for build machine (might not work on others)" OFF) option(NATIVE "Optimize for build machine (might not work on others)" OFF)
@ -7,6 +7,17 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)
option(BUILD_TESTS "Build tests" OFF) option(BUILD_TESTS "Build tests" OFF)
option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE)
set(CMAKE_CXX_STANDARD 11)
if (CUDA_BLAS)
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 11)
endif()
# MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively
set(MSVC_RT_LIB "MultiThreadedDLL")
set(X86_BUILD false) set(X86_BUILD false)
@ -17,23 +28,23 @@ endif()
# -fsanitize=address # -fsanitize=address
# -fsanitize=leak # -fsanitize=leak
if (ANDROID_BUILD) if (ANDROID_BUILD)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else")
elseif (APPLE) elseif (APPLE)
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true") set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true")
elseif(WIN32) elseif(WIN32)
set(X86_BUILD true) set(X86_BUILD true)
if (CUDA_BLAS) if (CUDA_BLAS)
set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true") set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc") set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc")
else() else()
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -std=c++11 -fmax-errors=2") set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -fmax-errors=2")
endif() endif()
else() else()
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -std=c++11 -fmax-errors=2") set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2")
if (CPU_BLAS) if (CPU_BLAS)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")

View File

@ -9,6 +9,7 @@ ExternalProject_Add(flatbuffers
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src" SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-build" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-build"
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
CMAKE_ARGS "-DFLATBUFFERS_BUILD_FLATC=OFF"
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND "" INSTALL_COMMAND ""
TEST_COMMAND "" TEST_COMMAND ""

View File

@ -136,116 +136,61 @@ if(CUDA_BLAS)
add_definitions(-D__CUDABLAS__=true) add_definitions(-D__CUDABLAS__=true)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set (CMAKE_CXX_FLAGS "") set (CMAKE_CXX_FLAGS "")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER 4.9 AND "$ENV{TRICK_NVCC}" STREQUAL "YES" AND CUDA_VERSION VERSION_LESS "8.0")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GNUC__=4 -D__GNUC_MINOR__=9 -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED")
set (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -std=c++11 -Dnullptr=NULL")
message("TRICKING CUDA INTO SUPPORTING GCC > 4.9 YOU ARE PROCEEDING AT YOUR OWN RISK")
endif()
endif()
# we want OpenMP to be available for hybrid operations, at least for GCC
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
endif() endif()
if (CUDA_FOUND) if (CUDA_FOUND)
message("CUDA include directory: ${CUDA_INCLUDE_DIRS}") message("CUDA include directory: ${CUDA_INCLUDE_DIRS}")
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
message("CUDA found!") message("CUDA found!")
set( CUDA_ARCHITECTURE_MINIMUM "3.0" CACHE STRING "Minimum required CUDA compute capability" )
SET(CUDA_VERBOSE_BUILD OFF)
SET(CUDA_SEPARABLE_COMPILATION OFF)
#set(CUDA_COMPUTE_CAPABILITY "61")
set(CUDA_COMPUTE_CAPABILITY "35")
# make NVCC more verbose to prevent timeouts on CI servers
#list(APPEND CUDA_NVCC_FLAGS -v)
if ("${EXPERIMENTAL}" STREQUAL "yes") if ("${EXPERIMENTAL}" STREQUAL "yes")
message("Experimental mode ENABLED") message("Experimental mode ENABLED")
list(APPEND CUDA_NVCC_FLAGS -D__ND4J_EXPERIMENTAL__=true) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
set (EXPM " -D__ND4J_EXPERIMENTAL__=true") set(EXPM " -D__ND4J_EXPERIMENTAL__=true")
endif() endif()
if (CMAKE_BUILD_TYPE STREQUAL "Release")
# the only difference for debug mode here is host/device debug symbols
set(CMAKE_CUDA_FLAGS_DEBUG " -G -g")
# we need -fPIC on Linux/GCC
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
message("Enabling fPIC...")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC")
endif()
if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 10 if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 10
if ("${COMPUTE}" STREQUAL "all") if ("${COMPUTE}" STREQUAL "all")
if (APPLE) if (APPLE)
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
endif() endif()
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
endif() endif()
elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9 elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9
if ("${COMPUTE}" STREQUAL "all") if ("${COMPUTE}" STREQUAL "all")
if (APPLE) if (APPLE)
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
endif() endif()
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
endif() endif()
elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8.0 elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8.0
if ("${COMPUTE}" STREQUAL "all") if ("${COMPUTE}" STREQUAL "all")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda --Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
endif() endif()
else() else()
if ("${COMPUTE}" STREQUAL "all") if ("${COMPUTE}" STREQUAL "all")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52 ) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_75 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52")
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_75 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
endif()
endif()
else()
# debug only
if (LINUX)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--export-dynamic -rdynamic")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --export-dynamic")
endif()
if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 9
message("CUDA 10 Debug build")
if ("${COMPUTE}" STREQUAL "all")
if (APPLE)
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
elseif()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70)
endif()
else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
endif()
elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9
if ("${COMPUTE}" STREQUAL "all")
if (APPLE)
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
elseif()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70)
endif()
else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
endif()
elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8
if ("${COMPUTE}" STREQUAL "all")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
endif()
else()
if ("${COMPUTE}" STREQUAL "all")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53)
else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
endif()
endif() endif()
endif() endif()
@ -264,30 +209,37 @@ if(CUDA_BLAS)
file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h)
file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu) file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu)
add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
CUDA_ADD_LIBRARY(${LIBND4J_NAME} SHARED cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
message("MSVC runtime for library: ${MSVC_RT_LIB}")
# static library is built only if we're going to build tests, skip otherwise
if (BUILD_TESTS)
add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
set_property(TARGET ${LIBND4J_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
install(TARGETS ${LIBND4J_NAME}static DESTINATION .)
endif()
# on windows we want to make sure we use MT or MD, but since we use it in one lib, we must use it everywhere to avoid conflicts
set_property(TARGET nd4jobj PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
set_property(TARGET ${LIBND4J_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
if(WIN32) if(WIN32)
message("CUDA on Windows: enabling /EHsc") message("CUDA on Windows: enabling /EHsc")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14")
endif() endif()
target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY}) target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda)
install(TARGETS ${LIBND4J_NAME} DESTINATION .) install(TARGETS ${LIBND4J_NAME} DESTINATION .)
add_custom_command(
TARGET ${LIBND4J_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:${LIBND4J_NAME}>
${PROJECT_BINARY_DIR}/../../tests_cpu/)
endif(CUDA_FOUND) endif(CUDA_FOUND)
elseif(CPU_BLAS) elseif(CPU_BLAS)
@ -334,15 +286,13 @@ elseif(CPU_BLAS)
if(IOS) if(IOS)
add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>) add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>)
else() else()
# static library is built only if we're going to build tests, skip otherwise
if (BUILD_TESTS)
add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>) add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
endif() endif()
#if(WIN32) add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
# message("CPU on Windows: enabling /EHsc") endif()
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
# SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14")
#endif()
# we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS
if (NOT BLAS_LIBRARIES) if (NOT BLAS_LIBRARIES)
@ -374,7 +324,6 @@ elseif(CPU_BLAS)
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
endif() endif()
#install(TARGETS mySharedLib DESTINATION /some/full/path)
install(TARGETS ${LIBND4J_NAME} DESTINATION .) install(TARGETS ${LIBND4J_NAME} DESTINATION .)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
endif() endif()

View File

@ -2103,7 +2103,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) {
template <typename T> template <typename T>
T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLong w) { T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLong w) {
if (rankOf() != 4 || i >= sizeAt(0) || j >= sizeAt(1) || k >= sizeAt(2), w >= sizeAt(3)) if (rankOf() != 4 || i >= sizeAt(0) || j >= sizeAt(1) || k >= sizeAt(2) || w >= sizeAt(3))
throw std::invalid_argument("NDArray::t(i,j,k,w): one of input indexes is out of array length or rank!=4 !"); throw std::invalid_argument("NDArray::t(i,j,k,w): one of input indexes is out of array length or rank!=4 !");
if (DataTypeUtils::fromT<T>() != _dataType) if (DataTypeUtils::fromT<T>() != _dataType)
throw std::invalid_argument("NDArray::t(i,j,k,w): type of array is not equal to template type T!"); throw std::invalid_argument("NDArray::t(i,j,k,w): type of array is not equal to template type T!");

View File

@ -1242,7 +1242,9 @@
#if defined(_MSC_VER) || defined(_WIN64) || defined(_WIN32) || defined(__CLION_IDE__) || defined(__VSCODE__) #if defined(_MSC_VER) || defined(_WIN64) || defined(_WIN32) || defined(__CLION_IDE__) || defined(__VSCODE__)
#define NOT_EXCLUDED(NAME) 1>0 #define NOT_EXCLUDED(NAME) 1>0
#else #else
#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME) // for now we don't want minifier mechanics working
//#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME)
#define NOT_EXCLUDED(NAME) 1>0
#endif #endif
#ifdef __JAVACPP_HACK__ #ifdef __JAVACPP_HACK__

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.6) cmake_minimum_required(VERSION 3.15)
project(tests_cpu) project(tests_cpu)
# Download and unpack googletest at configure time # Download and unpack googletest at configure time

View File

@ -5,9 +5,10 @@ project(googletest-download NONE)
include(ExternalProject) include(ExternalProject)
ExternalProject_Add(googletest ExternalProject_Add(googletest
GIT_REPOSITORY https://github.com/google/googletest.git GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.8.1 GIT_TAG release-1.10.0
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
CMAKE_ARGS ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND "" INSTALL_COMMAND ""

View File

@ -30,31 +30,30 @@ if (CUDA_BLAS)
if(WIN32) if(WIN32)
message("CUDA on Windows: enabling /EHsc") message("CUDA on Windows: enabling /EHsc")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /FS") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /FS")
SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc")
endif() endif()
if ("${COMPUTE}" STREQUAL "all") if ("${COMPUTE}" STREQUAL "all")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) set(CMAKE_CUDA_FLAGS " -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
else() else()
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) set(CMAKE_CUDA_FLAGS " -DCUDA_10 ${EXPM} -w -G -g --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
endif() endif()
endif() endif()
# -fsanitize=address # -fsanitize=address
# -fsanitize=leak # -fsanitize=leak
if (APPLE) if (APPLE)
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2 -D__APPLE_OS__=true") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true")
elseif(WIN32) elseif(WIN32)
if (CPU_BLAS) if (CPU_BLAS)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
endif() endif()
if (CPU_BLAS AND LINUX) if (CPU_BLAS AND LINUX)
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
endif() endif()
else() else()
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
else() else()
@ -68,14 +67,6 @@ else()
endif() endif()
endif() endif()
# TODO: get rid of this once problem confirmed solved
#if (APPLE)
# if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER 6.0 OR "${CMAKE_C_COMPILER_VERSION}" VERSION_EQUAL 6.0)
# SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mavx512f -fmax-errors=1")
# endif()
# endif()
#endif()
# tests are always compiled with all ops included # tests are always compiled with all ops included
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DBUILD_TESTS=true") SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DBUILD_TESTS=true")
@ -141,6 +132,15 @@ if (CPU_BLAS)
add_executable(runtests ${TEST_SOURCES}) add_executable(runtests ${TEST_SOURCES})
target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main)
elseif(CUDA_BLAS) elseif(CUDA_BLAS)
CUDA_ADD_EXECUTABLE(runtests ${TEST_SOURCES})
target_link_libraries(runtests ${LIBND4J_NAME} ${CUDA_LIBRARIES} gtest gtest_main) add_executable(runtests ${TEST_SOURCES})
message("MSVC runtime for tests: ${MSVC_RT_LIB}")
# applies to windows only
set_property(TARGET runtests PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
set_property(TARGET gtest PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
set_property(TARGET gtest_main PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} gtest gtest_main)
endif() endif()