From 2ef0d796e0c71b1dabb01725e019d39d9d0e26f6 Mon Sep 17 00:00:00 2001 From: agibsonccc Date: Mon, 29 Mar 2021 13:34:03 +0900 Subject: [PATCH] GPU test fixes --- .../run-gpu-integration-tests-self-hosted.yml | 10 +++---- deeplearning4j/deeplearning4j-data/pom.xml | 18 +++++++++++++ .../deeplearning4j-dataimport-solrj/pom.xml | 18 +++++++++++++ deeplearning4j/deeplearning4j-graph/pom.xml | 18 +++++++++++++ .../deeplearning4j-modelexport-solr/pom.xml | 18 +++++++++++++ .../deeplearning4j-modelimport/pom.xml | 18 +++++++++++++ .../deeplearning4j-nlp-parent/pom.xml | 26 +++++++++++++++++-- .../deeplearning4j-scaleout/spark/pom.xml | 18 +++++++++++++ .../deeplearning4j-ui-parent/pom.xml | 24 +++++++++++++++-- 9 files changed, 159 insertions(+), 9 deletions(-) diff --git a/.github/workflows/run-gpu-integration-tests-self-hosted.yml b/.github/workflows/run-gpu-integration-tests-self-hosted.yml index 98bde7f7f..4d8987e70 100644 --- a/.github/workflows/run-gpu-integration-tests-self-hosted.yml +++ b/.github/workflows/run-gpu-integration-tests-self-hosted.yml @@ -39,8 +39,8 @@ jobs: echo "Running tests for cuda 11.0" export PATH="/opt/protobuf/bin:/usr/local/cuda-11.2/bin:$PATH" mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.0,:deeplearning4j-cuda-11.0,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests - mvn -Dtest.offheap.size=2g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j - mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j + mvn -Dtest.offheap.size=6g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j + #mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j echo "Running tests for cuda 11.2" ${GITHUB_WORKSPACE}/change-cuda-versions.sh 11.2 echo "Changed cuda to 11.2" @@ -49,7 +49,7 @@ jobs: echo "Installing jars for 11.2" mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.2,:deeplearning4j-cuda-11.2,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests echo "Installed jars for 11.2, running smaller tests for cuda 11.2" - mvn -Dtest.offheap.size=2g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j - echo "Running larger for cuda 11.2" - mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j + mvn -Dtest.offheap.size=4g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j + #echo "Running larger for cuda 11.2" + #mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j diff --git a/deeplearning4j/deeplearning4j-data/pom.xml b/deeplearning4j/deeplearning4j-data/pom.xml index 5f047041b..d7033d3ad 100644 --- a/deeplearning4j/deeplearning4j-data/pom.xml +++ b/deeplearning4j/deeplearning4j-data/pom.xml @@ -56,8 +56,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml b/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml index 2abb61c15..899a81a17 100644 --- a/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml +++ b/deeplearning4j/deeplearning4j-dataimport-solrj/pom.xml @@ -117,8 +117,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-graph/pom.xml b/deeplearning4j/deeplearning4j-graph/pom.xml index 8ae897976..8b14d9916 100644 --- a/deeplearning4j/deeplearning4j-graph/pom.xml +++ b/deeplearning4j/deeplearning4j-graph/pom.xml @@ -74,8 +74,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml b/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml index 98052c3d7..a4fe90b4f 100644 --- a/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml +++ b/deeplearning4j/deeplearning4j-modelexport-solr/pom.xml @@ -310,8 +310,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-modelimport/pom.xml b/deeplearning4j/deeplearning4j-modelimport/pom.xml index 787f0ddf1..3fe4fb39d 100644 --- a/deeplearning4j/deeplearning4j-modelimport/pom.xml +++ b/deeplearning4j/deeplearning4j-modelimport/pom.xml @@ -127,8 +127,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-nlp-parent/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/pom.xml index 719ffa98d..dd4ba28f9 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/pom.xml +++ b/deeplearning4j/deeplearning4j-nlp-parent/pom.xml @@ -20,8 +20,8 @@ --> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 @@ -57,6 +57,28 @@ false + + + + maven-surefire-plugin + ${maven-surefire-plugin.version} + true + + + + 0 + + + false + + false + false + false + 1 + + + + org.deeplearning4j diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml index e73ed6e6c..0ad28bf14 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml +++ b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml @@ -193,8 +193,26 @@ nd4j-tests-cpu + nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + + diff --git a/deeplearning4j/deeplearning4j-ui-parent/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/pom.xml index 6c42c0645..4d0daf7f7 100644 --- a/deeplearning4j/deeplearning4j-ui-parent/pom.xml +++ b/deeplearning4j/deeplearning4j-ui-parent/pom.xml @@ -20,8 +20,8 @@ --> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 @@ -49,11 +49,31 @@ + + nd4j-tests-cpu + nd4j-tests-cpu nd4j-tests-cuda + + false + + + + org.deeplearning4j + dl4j-test-resources + ${dl4j-test-resources.version} + test + + + org.nd4j + nd4j-cuda-11.0 + ${nd4j.version} + test + +