[WIP] CUDA build (#109)
* working prototype of new CUDA build with cmake Signed-off-by: raver119 <raver119@gmail.com> * get rid of older stuff Signed-off-by: raver119 <raver119@gmail.com> * remove legacy CUDA debug section Signed-off-by: raver119 <raver119@gmail.com> * fPIC for GCC Signed-off-by: raver119 <raver119@gmail.com> * - switch to /MD - make MSVC runtime lib configurable from 1 place Signed-off-by: raver119 <raver119@gmail.com> * few last tweaks Signed-off-by: raver119 <raver119@gmail.com> * make static library optional Signed-off-by: raver119 <raver119@gmail.com> * typo fixed Signed-off-by: raver119 <raver119@gmail.com> master
parent
f78a638c9a
commit
fdda0221ed
|
@ -1,4 +1,4 @@
|
|||
cmake_minimum_required(VERSION 3.6)
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(libnd4j)
|
||||
set(CMAKE_VERBOSE_MAKEFILE OFF)
|
||||
option(NATIVE "Optimize for build machine (might not work on others)" OFF)
|
||||
|
@ -7,6 +7,17 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
|
|||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)
|
||||
|
||||
option(BUILD_TESTS "Build tests" OFF)
|
||||
option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
|
||||
set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
if (CUDA_BLAS)
|
||||
enable_language(CUDA)
|
||||
set(CMAKE_CUDA_STANDARD 11)
|
||||
endif()
|
||||
|
||||
# MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively
|
||||
set(MSVC_RT_LIB "MultiThreadedDLL")
|
||||
|
||||
set(X86_BUILD false)
|
||||
|
||||
|
@ -17,23 +28,23 @@ endif()
|
|||
# -fsanitize=address
|
||||
# -fsanitize=leak
|
||||
if (ANDROID_BUILD)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else")
|
||||
elseif (APPLE)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -std=c++11 -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true")
|
||||
elseif(WIN32)
|
||||
set(X86_BUILD true)
|
||||
if (CUDA_BLAS)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -std=c++11 -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -fmax-errors=2")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -std=c++11 -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2")
|
||||
|
||||
if (CPU_BLAS)
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
|
||||
|
|
|
@ -9,6 +9,7 @@ ExternalProject_Add(flatbuffers
|
|||
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-src"
|
||||
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers-build"
|
||||
CONFIGURE_COMMAND ""
|
||||
CMAKE_ARGS "-DFLATBUFFERS_BUILD_FLATC=OFF"
|
||||
BUILD_COMMAND ""
|
||||
INSTALL_COMMAND ""
|
||||
TEST_COMMAND ""
|
||||
|
|
|
@ -136,116 +136,61 @@ if(CUDA_BLAS)
|
|||
add_definitions(-D__CUDABLAS__=true)
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||
set (CMAKE_CXX_FLAGS "")
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER 4.9 AND "$ENV{TRICK_NVCC}" STREQUAL "YES" AND CUDA_VERSION VERSION_LESS "8.0")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GNUC__=4 -D__GNUC_MINOR__=9 -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED")
|
||||
set (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -std=c++11 -Dnullptr=NULL")
|
||||
message("TRICKING CUDA INTO SUPPORTING GCC > 4.9 YOU ARE PROCEEDING AT YOUR OWN RISK")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# we want OpenMP to be available for hybrid operations, at least for GCC
|
||||
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
find_package(OpenMP)
|
||||
if (OPENMP_FOUND)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (CUDA_FOUND)
|
||||
message("CUDA include directory: ${CUDA_INCLUDE_DIRS}")
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
message("CUDA found!")
|
||||
set( CUDA_ARCHITECTURE_MINIMUM "3.0" CACHE STRING "Minimum required CUDA compute capability" )
|
||||
SET(CUDA_VERBOSE_BUILD OFF)
|
||||
SET(CUDA_SEPARABLE_COMPILATION OFF)
|
||||
#set(CUDA_COMPUTE_CAPABILITY "61")
|
||||
set(CUDA_COMPUTE_CAPABILITY "35")
|
||||
# make NVCC more verbose to prevent timeouts on CI servers
|
||||
#list(APPEND CUDA_NVCC_FLAGS -v)
|
||||
|
||||
if ("${EXPERIMENTAL}" STREQUAL "yes")
|
||||
message("Experimental mode ENABLED")
|
||||
list(APPEND CUDA_NVCC_FLAGS -D__ND4J_EXPERIMENTAL__=true)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
|
||||
set (EXPM " -D__ND4J_EXPERIMENTAL__=true")
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true")
|
||||
set(EXPM " -D__ND4J_EXPERIMENTAL__=true")
|
||||
endif()
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||
if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 10
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60)
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70)
|
||||
endif()
|
||||
|
||||
# the only difference for debug mode here is host/device debug symbols
|
||||
set(CMAKE_CUDA_FLAGS_DEBUG " -G -g")
|
||||
|
||||
# we need -fPIC on Linux/GCC
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
message("Enabling fPIC...")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC")
|
||||
endif()
|
||||
|
||||
if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 10
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60)
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70)
|
||||
endif()
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8.0
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60)
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
|
||||
endif()
|
||||
else()
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52 )
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} --cudart=static --expt-extended-lambda -O3 -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_10 ${EXPM} -w --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
|
||||
endif()
|
||||
elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
|
||||
else()
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_9 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
|
||||
endif()
|
||||
elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8.0
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60")
|
||||
else()
|
||||
# CUDA 8, single explicit compute capability: append the nvcc flags for that one architecture.
# The fatbinary pass-through option is spelled -Xfatbin (long form: --fatbin-options);
# "--Xfatbin" is not a valid nvcc option, so use the same spelling as the sibling branches.
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_8 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
# debug only
|
||||
if (LINUX)
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--export-dynamic -rdynamic")
|
||||
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --export-dynamic")
|
||||
endif()
|
||||
|
||||
if(CUDA_VERSION VERSION_GREATER "9.2") # cuda 10
|
||||
message("CUDA 10 Debug build")
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
|
||||
# non-Apple hosts: an elseif() with an empty condition is always false, which made
# this branch unreachable; a plain else() is what was intended here
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70)
|
||||
endif()
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
elseif(CUDA_VERSION VERSION_GREATER "8.0") # cuda 9
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
if (APPLE)
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
|
||||
# non-Apple hosts: an elseif() with an empty condition is always false, which made
# this branch unreachable; a plain else() is what was intended here
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70)
|
||||
endif()
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_9 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
elseif (CUDA_VERSION VERSION_GREATER "7.5") # cuda 8
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62)
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_8 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_75 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52")
|
||||
else()
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_53,code=sm_53)
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_75 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
endif()
|
||||
set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -DCUDA_75 ${EXPM} -w --cudart=static --expt-extended-lambda -Xfatbin -compress-all -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -264,30 +209,37 @@ if(CUDA_BLAS)
|
|||
file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h)
|
||||
file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu)
|
||||
|
||||
|
||||
CUDA_ADD_LIBRARY(${LIBND4J_NAME} SHARED cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
|
||||
add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA}
|
||||
${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
|
||||
../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h
|
||||
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp
|
||||
Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
|
||||
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
|
||||
|
||||
add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
|
||||
|
||||
message("MSVC runtime for library: ${MSVC_RT_LIB}")
|
||||
|
||||
# static library is built only if we're going to build tests, skip otherwise
|
||||
if (BUILD_TESTS)
|
||||
add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
|
||||
set_property(TARGET ${LIBND4J_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
install(TARGETS ${LIBND4J_NAME}static DESTINATION .)
|
||||
endif()
|
||||
|
||||
# on windows we want to make sure we use MT or MD, but since we use it in one lib, we must use it everywhere to avoid conflicts
|
||||
set_property(TARGET nd4jobj PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
set_property(TARGET ${LIBND4J_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
|
||||
if(WIN32)
|
||||
message("CUDA on Windows: enabling /EHsc")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
|
||||
SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14")
|
||||
endif()
|
||||
|
||||
target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY})
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda)
|
||||
|
||||
install(TARGETS ${LIBND4J_NAME} DESTINATION .)
|
||||
|
||||
add_custom_command(
|
||||
TARGET ${LIBND4J_NAME} POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:${LIBND4J_NAME}>
|
||||
${PROJECT_BINARY_DIR}/../../tests_cpu/)
|
||||
endif(CUDA_FOUND)
|
||||
elseif(CPU_BLAS)
|
||||
|
||||
|
@ -334,16 +286,14 @@ elseif(CPU_BLAS)
|
|||
if(IOS)
|
||||
add_library(${LIBND4J_NAME} STATIC $<TARGET_OBJECTS:nd4jobj>)
|
||||
else()
|
||||
add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
|
||||
# static library is built only if we're going to build tests, skip otherwise
|
||||
if (BUILD_TESTS)
|
||||
add_library(${LIBND4J_NAME}static STATIC $<TARGET_OBJECTS:nd4jobj>)
|
||||
endif()
|
||||
|
||||
add_library(${LIBND4J_NAME} SHARED $<TARGET_OBJECTS:nd4jobj>)
|
||||
endif()
|
||||
|
||||
#if(WIN32)
|
||||
# message("CPU on Windows: enabling /EHsc")
|
||||
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14")
|
||||
# SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14")
|
||||
#endif()
|
||||
|
||||
# we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS
|
||||
if (NOT BLAS_LIBRARIES)
|
||||
set(BLAS_LIBRARIES "")
|
||||
|
@ -374,7 +324,6 @@ elseif(CPU_BLAS)
|
|||
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
|
||||
endif()
|
||||
|
||||
#install(TARGETS mySharedLib DESTINATION /some/full/path)
|
||||
install(TARGETS ${LIBND4J_NAME} DESTINATION .)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
|
||||
endif()
|
||||
|
|
|
@ -2103,7 +2103,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) {
|
|||
template <typename T>
|
||||
T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLong w) {
|
||||
|
||||
if (rankOf() != 4 || i >= sizeAt(0) || j >= sizeAt(1) || k >= sizeAt(2), w >= sizeAt(3))
|
||||
if (rankOf() != 4 || i >= sizeAt(0) || j >= sizeAt(1) || k >= sizeAt(2) || w >= sizeAt(3))
|
||||
throw std::invalid_argument("NDArray::t(i,j,k,w): one of input indexes is out of array length or rank!=4 !");
|
||||
if (DataTypeUtils::fromT<T>() != _dataType)
|
||||
throw std::invalid_argument("NDArray::t(i,j,k,w): type of array is not equal to template type T!");
|
||||
|
|
|
@ -1242,7 +1242,9 @@
|
|||
#if defined(_MSC_VER) || defined(_WIN64) || defined(_WIN32) || defined(__CLION_IDE__) || defined(__VSCODE__)
|
||||
#define NOT_EXCLUDED(NAME) 1>0
|
||||
#else
|
||||
#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME)
|
||||
// for now we don't want minifier mechanics working
|
||||
//#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME)
|
||||
#define NOT_EXCLUDED(NAME) 1>0
|
||||
#endif
|
||||
|
||||
#ifdef __JAVACPP_HACK__
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
cmake_minimum_required(VERSION 3.6)
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(tests_cpu)
|
||||
|
||||
# Download and unpack googletest at configure time
|
||||
|
|
|
@ -5,9 +5,10 @@ project(googletest-download NONE)
|
|||
include(ExternalProject)
|
||||
ExternalProject_Add(googletest
|
||||
GIT_REPOSITORY https://github.com/google/googletest.git
|
||||
GIT_TAG release-1.8.1
|
||||
GIT_TAG release-1.10.0
|
||||
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
|
||||
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
|
||||
CMAKE_ARGS ""
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
INSTALL_COMMAND ""
|
||||
|
|
|
@ -30,31 +30,30 @@ if (CUDA_BLAS)
|
|||
if(WIN32)
|
||||
message("CUDA on Windows: enabling /EHsc")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /FS")
|
||||
SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc")
|
||||
endif()
|
||||
|
||||
if ("${COMPUTE}" STREQUAL "all")
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70)
|
||||
set(CMAKE_CUDA_FLAGS " -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE})
|
||||
set(CMAKE_CUDA_FLAGS " -DCUDA_10 ${EXPM} -w -G -g --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# -fsanitize=address
|
||||
# -fsanitize=leak
|
||||
if (APPLE)
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2 -D__APPLE_OS__=true")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true")
|
||||
elseif(WIN32)
|
||||
if (CPU_BLAS)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
|
||||
endif()
|
||||
|
||||
if (CPU_BLAS AND LINUX)
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
|
||||
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
|
||||
else()
|
||||
|
@ -68,14 +67,6 @@ else()
|
|||
endif()
|
||||
endif()
|
||||
|
||||
# TODO: get rid of this once problem confirmed solved
|
||||
#if (APPLE)
|
||||
# if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
# if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER 6.0 OR "${CMAKE_C_COMPILER_VERSION}" VERSION_EQUAL 6.0)
|
||||
# SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mavx512f -fmax-errors=1")
|
||||
# endif()
|
||||
# endif()
|
||||
#endif()
|
||||
|
||||
# tests are always compiled with all ops included
|
||||
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DBUILD_TESTS=true")
|
||||
|
@ -141,6 +132,15 @@ if (CPU_BLAS)
|
|||
add_executable(runtests ${TEST_SOURCES})
|
||||
target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main)
|
||||
elseif(CUDA_BLAS)
|
||||
CUDA_ADD_EXECUTABLE(runtests ${TEST_SOURCES})
|
||||
target_link_libraries(runtests ${LIBND4J_NAME} ${CUDA_LIBRARIES} gtest gtest_main)
|
||||
|
||||
add_executable(runtests ${TEST_SOURCES})
|
||||
|
||||
message("MSVC runtime for tests: ${MSVC_RT_LIB}")
|
||||
|
||||
# applies to windows only
|
||||
set_property(TARGET runtests PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
set_property(TARGET gtest PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
set_property(TARGET gtest_main PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$<CONFIG:Debug>:Debug>")
|
||||
|
||||
target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} gtest gtest_main)
|
||||
endif()
|
Loading…
Reference in New Issue