From 15f50eb69a9245b1773c358705398d4f07f57a42 Mon Sep 17 00:00:00 2001 From: agibsonccc Date: Mon, 29 Mar 2021 15:31:06 +0900 Subject: [PATCH] Disable parallel tests due to OOM on GPU --- .github/workflows/run-cpu-integration-tests-self-hosted.yml | 2 +- .github/workflows/run-gpu-integration-tests-self-hosted.yml | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-cpu-integration-tests-self-hosted.yml b/.github/workflows/run-cpu-integration-tests-self-hosted.yml index 5aeb1fd21..b18938ade 100644 --- a/.github/workflows/run-cpu-integration-tests-self-hosted.yml +++ b/.github/workflows/run-cpu-integration-tests-self-hosted.yml @@ -9,8 +9,8 @@ jobs: uses: styfle/cancel-workflow-action@0.8.0 with: access_token: ${{ github.token }} - - uses: ./.github/actions/download-dl4j-test-resources-linux - uses: actions/checkout@v2 + - uses: ./.github/actions/download-dl4j-test-resources-linux - name: Run cpu tests shell: bash env: diff --git a/.github/workflows/run-gpu-integration-tests-self-hosted.yml b/.github/workflows/run-gpu-integration-tests-self-hosted.yml index 609590205..caeb13de3 100644 --- a/.github/workflows/run-gpu-integration-tests-self-hosted.yml +++ b/.github/workflows/run-gpu-integration-tests-self-hosted.yml @@ -36,10 +36,11 @@ jobs: export OMP_NUM_THREADS=1 mkdir -p ${GITHUB_WORKSPACE}/resources mkdir -p ${GITHUB_WORKSPACE}/cache + export CUDA_VISIBLE_DEVICES=0 echo "Running tests for cuda 11.0" export PATH="/opt/protobuf/bin:/usr/local/cuda-11.2/bin:$PATH" mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.0,:deeplearning4j-cuda-11.0,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests - mvn -Dtest.offheap.size=6g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j + mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=6g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j #mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j echo "Running tests for cuda 11.2" ${GITHUB_WORKSPACE}/change-cuda-versions.sh 11.2 @@ -49,7 +50,7 @@ jobs: echo "Installing jars for 11.2" mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.2,:deeplearning4j-cuda-11.2,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests echo "Installed jars for 11.2, running smaller tests for cuda 11.2" - mvn -Dtest.offheap.size=4g -Dtest.heap.size=2g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j + mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=4g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j #echo "Running larger for cuda 11.2" #mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j