on: workflow_dispatch: jobs: # Wait for up to a minute for previous run to complete, abort if not done by then pre-ci: runs-on: self-hosted timeout-minutes: 1 steps: - name: 'Block Concurrent Executions' uses: softprops/turnstyle@v1 with: poll-interval-seconds: 10 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} linux-x86_64: needs: pre-ci runs-on: [self-hosted] steps: - uses: AutoModality/action-clean@v1 - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.8.0 with: access_token: ${{ github.token }} - uses: actions/checkout@v2 - uses: ./.github/actions/download-dl4j-test-resources-linux - name: Run gpu tests shell: bash env: DEBIAN_FRONTEND: noninteractive run: | export PATH="/opt/protobuf/bin:/usr/local/cuda-11/bin:$PATH" nvcc --version mvn --version cmake --version protoc --version export OMP_NUM_THREADS=1 mkdir -p ${GITHUB_WORKSPACE}/resources mkdir -p ${GITHUB_WORKSPACE}/cache export CUDA_VISIBLE_DEVICES=0 echo "Running tests for cuda 11.0" export PATH="/opt/protobuf/bin:/usr/local/cuda-11/bin:$PATH" mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.0,:deeplearning4j-cuda-11.0,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests echo "Running small tests for cuda 11.0" mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=6g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j echo "Running large tests for cuda 11.0" mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j #echo "Running tests for cuda 11.2" #${GITHUB_WORKSPACE}/change-cuda-versions.sh 11.2 #echo "Changed cuda to 11.2" #export PATH="/opt/protobuf/bin:/usr/local/cuda-11.2/bin:$PATH" #echo "Updated path for 11.2" #echo "Installing jars for 11.2" #mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.2,:deeplearning4j-cuda-11.2,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests echo "Installed jars for 11.2, running smaller tests for cuda 11.2" #mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=4g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j #echo "Running larger for cuda 11.2" #mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j