Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/docker-devito.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ jobs:
build-args: base=devitocodes/${{ matrix.base }}

- name: Remove dangling layers
if: ${{ !contains(matrix.runner, 'nvidiagpu') }}
run: docker system prune -f

- name: Run tests
Expand Down
131 changes: 81 additions & 50 deletions .github/workflows/pytest-gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@ concurrency:
cancel-in-progress: true

env:
OUTPUT_PATH: ${{ github.workspace }}
RESOURCE_GROUP: CI-gpu

on:
push:
branches: [ main ]
branches: [main]
pull_request:
branches: [ main ]
branches: [main]
workflow_dispatch:
inputs:
tags:
description: 'Run GPU tests'
description: "Run GPU tests"

jobs:
build:
Expand All @@ -35,66 +34,63 @@ jobs:
- self-hosted
- ${{ matrix.runner_label }}

# Job-level env (includes per-runner image/container tags)
env:
DEVITO_ARCH: ${{ matrix.arch }}
DEVITO_PLATFORM: ${{ matrix.platform }}
DEVITO_LANGUAGE: ${{ matrix.language }}
OMPI_CC: ${{ matrix.arch }}

strategy:
fail-fast: false
matrix:
name: [
pytest-gpu-acc-nvidia,
pytest-gpu-omp-amd
]
name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]

include:
# -------------------- NVIDIA job --------------------
- name: pytest-gpu-acc-nvidia
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
base: "devitocodes/bases:nvidia-nvc"
runner_label: nvidiagpu
test_drive_cmd: "nvidia-smi"
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
flags: >-
--init --rm -t
--name ${CONTAINER_BASENAME}
--env CUDA_VISIBLE_DEVICES
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"

# -------------------- AMD job -----------------------
- name: pytest-gpu-omp-amd
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
runner_label: amdgpu
base: "devitocodes/bases:amd"
test_drive_cmd: "rocm-smi"
# Unchanged, still passes through required /dev nodes etc.
flags: >-
--init --network=host
--device=/dev/kfd --device=/dev/dri
--ipc=host
--group-add video --group-add $(getent group render | cut -d: -f3)
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
--rm -t
--name ${CONTAINER_BASENAME}
# -------------------- NVIDIA job --------------------
- name: pytest-gpu-acc-nvidia
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
base: "devitocodes/bases:nvidia-nvc"
runner_label: nvidiagpu
test_drive_cmd: "nvidia-smi"
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
flags: >-
--init --rm -t
--name ${CONTAINER_BASENAME}
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"

# -------------------- AMD job -----------------------
- name: pytest-gpu-omp-amd
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
runner_label: amdgpu
base: "devitocodes/bases:amd"
test_drive_cmd: "rocm-smi"
# Unchanged, still passes through required /dev nodes etc.
flags: >-
--init --network=host
--device=/dev/kfd --device=/dev/dri
--ipc=host
--group-add video --group-add "$(getent group render | cut -d: -f3)"
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
--rm -t
--name ${CONTAINER_BASENAME}

steps:
- name: Checkout devito
uses: actions/checkout@v4

- name: Set per-runner tags
run: |
echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV

- name: Ensure buildx builder
run: |
docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
docker buildx use "${RUNNER_NAME// /_}"

- name: Build docker image
run: |
docker build . \
docker buildx build . \
--builder "${RUNNER_NAME// /_}" \
--load \
--label ci-run=$GITHUB_RUN_ID \
--rm --pull \
--file docker/Dockerfile.devito \
--tag "${DOCKER_IMAGE}" \
Expand All @@ -105,16 +101,37 @@ jobs:

- name: Probe gpu
run: |
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
# Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
# runners; fall back to "all" so the driver probe does not fail.
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
fi

# Run a simple driver-probe command (nvidia-smi / rocm-smi)
docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}

- name: Test with pytest
env:
# Exported earlier in the job; needed inside the container for codecov
CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
run: |
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
# Add Codecov’s environment variables (GITHUB_SHA, etc.)
ci_env=$(bash <(curl -s https://codecov.io/env))

docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
# Run the test suite using the matrix-defined flags
docker run ${{ matrix.flags }} \
${ci_env} \
-e CI=true \
-e PYTHONFAULTHANDLER=1 \
-e DEVITO_LOGGING=DEBUG \
-e CODECOV_TOKEN \
"${DOCKER_IMAGE}" \
pytest -vvv --capture=no --showlocals \
--log-cli-level=DEBUG -o log_cli=true \
--full-trace --durations=10 \
--cov --cov-config=.coveragerc --cov-report=xml \
${{ matrix.test_files }}

- name: Test examples
run: |
Expand All @@ -124,3 +141,17 @@ jobs:
run: |
docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
mpiexec -n 2 pytest ${{ matrix.test_examples }}

- name: Builder & image cleanup (keep 3 days of cache)
if: always()
run: |
# Remove only the test image we built
docker rmi -f "${DOCKER_IMAGE}" || true

# Classic image layers created in this job
docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID

# BuildKit cache: target the per-runner builder explicitly
docker builder prune --builder "${RUNNER_NAME// /_}" \
-f \
--filter "until=72h"
Loading