cavis/.github/workflows/run-gpu-integration-tests-s...

59 lines
4.0 KiB
YAML

on:
workflow_dispatch:
jobs:
# Wait for up to a minute for previous run to complete, abort if not done by then
pre-ci:
runs-on: self-hosted
timeout-minutes: 1
steps:
- name: 'Block Concurrent Executions'
uses: softprops/turnstyle@v1
with:
poll-interval-seconds: 10
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
linux-x86_64:
needs: pre-ci
runs-on: [self-hosted]
steps:
- uses: AutoModality/action-clean@v1
- name: Cancel Previous Runs
uses: styfle/cancel-workflow-action@0.8.0
with:
access_token: ${{ github.token }}
- uses: actions/checkout@v2
- uses: ./.github/actions/download-dl4j-test-resources-linux
- name: Run gpu tests
shell: bash
env:
DEBIAN_FRONTEND: noninteractive
run: |
export PATH="/opt/protobuf/bin:/usr/local/cuda-11/bin:$PATH"
nvcc --version
mvn --version
cmake --version
protoc --version
export OMP_NUM_THREADS=1
mkdir -p ${GITHUB_WORKSPACE}/resources
mkdir -p ${GITHUB_WORKSPACE}/cache
export CUDA_VISIBLE_DEVICES=0
echo "Running tests for cuda 11.0"
export PATH="/opt/protobuf/bin:/usr/local/cuda-11/bin:$PATH"
mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.0,:deeplearning4j-cuda-11.0,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests
echo "Running small tests for cuda 11.0"
mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=6g -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
echo "Running large tests for cuda 11.0"
mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j
#echo "Running tests for cuda 11.2"
#${GITHUB_WORKSPACE}/change-cuda-versions.sh 11.2
#echo "Changed cuda to 11.2"
#export PATH="/opt/protobuf/bin:/usr/local/cuda-11.2/bin:$PATH"
#echo "Updated path for 11.2"
#echo "Installing jars for 11.2"
#mvn -Djavacpp.platform=linux-x86_64 -Dlibnd4j.chip=cuda -pl ":nd4j-cuda-11.2,:deeplearning4j-cuda-11.2,:libnd4j" --also-make -Pcuda clean --batch-mode install -DskipTests
echo "Installed jars for 11.2, running smaller tests for cuda 11.2"
#mvn -Djunit.jupiter.execution.parallel.enabled=false -Dtest.offheap.size=4g -Pcuda -Dlibnd4j.chip=cuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -DexcludedGroups="long-running-tests, large-resources, distributed-systems" -DskipTestResourceEnforcement=true -Ptestresources -Pintegration-tests -Pnd4j-tests-cuda clean test --fail-never -rf :nd4j
#echo "Running larger for cuda 11.2"
#mvn -Pcuda -Dlibnd4j.chip=cuda -Dorg.nd4j.strumpf.resource.dirs=${GITHUB_WORKSPACE}/resources -Dorg.nd4j.test.resources.cache.dir=${GITHUB_WORKSPACE}/cache -Dgroups="long-running-tests, large-resources, distributed-systems" -Ptestresources -Pnd4j-tests-cuda -Dtest.offheap.size=14g -Dtest.heap.size=6g -Dsurefire.parallel.forcedTimeout=200 -Dsurefire.parallel.timeout=200 -Dsurefire.timeout=200 -Dsurefire.exitTimeout=200 test --fail-never -rf :nd4j