GPU test fixes

master
agibsonccc 2021-03-29 13:34:03 +09:00
parent 6b681187e5
commit 2ef0d796e0
9 changed files with 159 additions and 9 deletions

View File

@ -39,8 +39,8 @@ jobs:
echo "Running tests for cuda 11.0"
export PATH="/opt/protobuf/bin:/usr/local/cuda-11.2/bin:$PATH"
mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.0,:deeplearning4j-cuda-11.0,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests
mvn -Dtest.offheap.size=2g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j
mvn -Dtest.offheap.size=6g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
#mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j
echo "Running tests for cuda 11.2"
${GITHUB_WORKSPACE}/change-cuda-versions.sh 11.2
echo "Changed cuda to 11.2"
@ -49,7 +49,7 @@ jobs:
echo "Installing jars for 11.2"
mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.2,:deeplearning4j-cuda-11.2,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests
echo "Installed jars for 11.2, running smaller tests for cuda 11.2"
mvn -Dtest.offheap.size=2g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
echo "Running larger for cuda 11.2"
mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j
mvn -Dtest.offheap.size=4g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
#echo "Running larger for cuda 11.2"
#mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j

View File

@ -56,8 +56,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -117,8 +117,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -74,8 +74,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -310,8 +310,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -127,8 +127,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -20,8 +20,8 @@
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
@ -57,6 +57,28 @@
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<!-- Surefire (test runner) settings contributed by the enclosing profile. -->
<build>
<plugins>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>${maven-surefire-plugin.version}</version>
<inherited>true</inherited>
<!-- GPU tests fail on multi-GPU machines, so the test process is restricted to a single device via CUDA_VISIBLE_DEVICES below. -->
<configuration>
<environmentVariables>
<!-- Environment variable passed to the test process: only CUDA device 0 is made visible. -->
<CUDA_VISIBLE_DEVICES>0</CUDA_VISIBLE_DEVICES>
</environmentVariables>
<!-- NOTE(review): surefire appears to derive parallelMavenExecution from the Maven session; confirm that setting it here has any effect. -->
<parallelMavenExecution>
false
</parallelMavenExecution>
<!-- Keep complete stack traces in test failure output. -->
<trimStackTrace>false</trimStackTrace>
<!-- Print test reports to the console rather than to per-test files. -->
<useFile>false</useFile>
<!-- At most one forked test JVM at a time, and forks are not reused. -->
<reuseForks>false</reuseForks>
<forkCount>1</forkCount>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.deeplearning4j</groupId>

View File

@ -193,8 +193,26 @@
<!-- Declared with an id only: this profile contributes no settings of its own in this POM. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -20,8 +20,8 @@
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
@ -49,11 +49,31 @@
</modules>
</profile>
<!-- NOTE(review): two profiles with the id nd4j-tests-cpu appear back to back here. Presumably one is the pre-change line of a whitespace-only diff; if both really exist in the POM, confirm that one is removed, since profile ids are expected to be unique. -->
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<profile>
<id>nd4j-tests-cpu</id>
</profile>
<!-- Opt-in profile that puts the CUDA artifact on the test classpath: "mvn clean test -P nd4j-tests-cuda" -->
<profile>
<id>nd4j-tests-cuda</id>
<activation>
<!-- Not active by default; it has to be requested explicitly. -->
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<!-- Test-scoped dependency on dl4j-test-resources. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-test-resources</artifactId>
<version>${dl4j-test-resources.version}</version>
<scope>test</scope>
</dependency>
<!-- Test-scoped dependency on nd4j-cuda-11.0. NOTE(review): the artifactId hard-codes 11.0; confirm how other CUDA versions are selected. -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-cuda-11.0</artifactId>
<version>${nd4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>