From b1fa43a17b84ef811d831de3b47a832ae5cccf82 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Wed, 23 Jul 2025 15:35:38 +0100 Subject: [PATCH 1/5] ci: isolate GPU runners, respect CUDA_VISIBLE_DEVICES, drop global prunes * Pass CUDA_VISIBLE_DEVICES and use --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" for NVIDIA jobs * Tag images/containers with ${{ runner.name }} to avoid cross-runner collisions * Remove docker system prune on shared nvidiagpu hosts (keep cache, avoid races) * Add runner label to NVIDIA base builds for traceable cleanup * Minor YAML tidy/comments across workflows --- .github/workflows/docker-bases.yml | 116 +++++++++++++--------------- .github/workflows/docker-devito.yml | 15 +++- .github/workflows/pytest-gpu.yml | 112 ++++++++++++++++----------- 3 files changed, 132 insertions(+), 111 deletions(-) diff --git a/.github/workflows/docker-bases.yml b/.github/workflows/docker-bases.yml index 8a5f2cfd10..0d71e2ca9e 100644 --- a/.github/workflows/docker-bases.yml +++ b/.github/workflows/docker-bases.yml @@ -7,10 +7,10 @@ concurrency: on: push: paths: - - '/docker/Dockerfile.nvidia' - - '/docker/Dockerfile.cpu' - - '/docker/Dockerfile.amd' - - '/docker/Dockerfile.intel' + - "/docker/Dockerfile.nvidia" + - "/docker/Dockerfile.cpu" + - "/docker/Dockerfile.amd" + - "/docker/Dockerfile.intel" workflow_dispatch: inputs: cpu: @@ -27,15 +27,15 @@ on: default: false tags: - description: 'Build compiler bases' + description: "Build compiler bases" schedule: # Run once a month - cron: "0 0 1 * *" jobs: -####################################################### -############## Basic gcc CPU ########################## -####################################################### + ####################################################### + ############## Basic gcc CPU ########################## + ####################################################### deploy-cpu-bases: if: inputs.cpu name: "cpu-base" @@ -66,22 +66,18 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - name: cleanup - run: docker system prune -a -f - - name: GCC image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.cpu' + file: "./docker/Dockerfile.cpu" push: true - build-args: 'gcc=${{ matrix.gcc }}' - tags: 'devitocodes/bases:cpu-gcc${{ matrix.gcc }}' - + build-args: "gcc=${{ matrix.gcc }}" + tags: "devitocodes/bases:cpu-gcc${{ matrix.gcc }}" -####################################################### -############## Intel OneApi CPU ####################### -####################################################### + ####################################################### + ############## Intel OneApi CPU ####################### + ####################################################### deploy-oneapi-bases: if: inputs.intel name: "oneapi-base" @@ -107,43 +103,39 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - - name: cleanup - run: docker system prune -a -f - - name: ICX image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.intel' + file: "./docker/Dockerfile.intel" push: true - target: 'icx' - build-args: 'arch=icx' - tags: 'devitocodes/bases:cpu-icx' + target: "icx" + build-args: "arch=icx" + tags: "devitocodes/bases:cpu-icx" - name: SYCL CPU image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.intel' + file: "./docker/Dockerfile.intel" push: true - target: 'cpu-sycl' - build-args: 'arch=cpu-sycl' - tags: 'devitocodes/bases:cpu-sycl' + target: "cpu-sycl" + build-args: "arch=cpu-sycl" + tags: "devitocodes/bases:cpu-sycl" - name: SYCL GPU image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.intel' + file: "./docker/Dockerfile.intel" push: true - target: 'gpu-sycl' - build-args: 'arch=gpu-sycl' - tags: 'devitocodes/bases:gpu-sycl' + target: "gpu-sycl" + build-args: "arch=gpu-sycl" + tags: "devitocodes/bases:gpu-sycl" -####################################################### -################### Nvidia nvhpc ###################### -####################################################### + ####################################################### + ################### Nvidia nvhpc ###################### + ####################################################### deploy-nvidia-bases: if: inputs.nvidia name: "nvidia-bases" @@ -170,42 +162,43 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - name: cleanup - run: docker system prune -a -f - - name: NVC image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.nvidia' + file: "./docker/Dockerfile.nvidia" push: true - target: 'nvc' - build-args: 'arch=nvc' - tags: 'devitocodes/bases:nvidia-nvc' + target: "nvc" + build-args: "arch=nvc" + # Label (not tag) with runner name for traceability without changing image tags + labels: builder-runner=${{ runner.name }} + tags: "devitocodes/bases:nvidia-nvc" - name: NVCC image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.nvidia' + file: "./docker/Dockerfile.nvidia" push: true - target: 'nvcc' - build-args: 'arch=nvcc' - tags: 'devitocodes/bases:nvidia-nvcc' + target: "nvcc" + build-args: "arch=nvcc" + labels: builder-runner=${{ runner.name }} + tags: "devitocodes/bases:nvidia-nvcc" - name: NVC host image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.nvidia' + file: "./docker/Dockerfile.nvidia" push: true - target: 'nvc-host' - build-args: 'arch=nvc-host' - tags: 'devitocodes/bases:cpu-nvc' - -####################################################### -##################### AMD ############################# -####################################################### + target: "nvc-host" + build-args: "arch=nvc-host" + labels: builder-runner=${{ runner.name }} + tags: "devitocodes/bases:cpu-nvc" + + ####################################################### + ##################### AMD ############################# + ####################################################### deploy-amd-bases: if: inputs.amd name: "amd-base" @@ -232,16 +225,13 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - name: cleanup - run: docker system prune -a -f - - name: AMD image uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.amd' + file: "./docker/Dockerfile.amd" push: true - target: 'amdclang' + target: "amdclang" build-args: | ROCM_VERSION=5.5.1 UCX_BRANCH=v1.13.1 @@ -252,9 +242,9 @@ jobs: uses: docker/build-push-action@v6 with: context: . - file: './docker/Dockerfile.amd' + file: "./docker/Dockerfile.amd" push: true - target: 'hip' + target: "hip" build-args: | ROCM_VERSION=6.3.4 tags: devitocodes/bases:amd-hip diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 74f5ed942c..dd7a67b35a 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -13,6 +13,9 @@ jobs: env: # Use buildkit https://docs.docker.com/develop/develop-images/build_enhancements/ for better build DOCKER_BUILDKIT: "1" + # Unique container name to avoid clashes across concurrent self‑hosted runners + CONTAINER_NAME: testrun-${{ matrix.tag }}-${{ runner.name }} + strategy: fail-fast: false @@ -20,7 +23,12 @@ jobs: include: - base: 'bases:nvidia-nvc' tag: 'nvidia-nvc' - flag: '--init --gpus all' + # Respect CUDA_VISIBLE_DEVICES set by the runner and hard‑limit docker to that device. + # (--env without value forwards host var; --gpus maps only that device) + flag: | + --init + --env CUDA_VISIBLE_DEVICES + --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" test: 'tests/test_gpu_openacc.py tests/test_gpu_common.py' runner: ["self-hosted", "nvidiagpu"] @@ -102,8 +110,11 @@ jobs: build-args: base=devitocodes/${{ matrix.base }} - name: Remove dangling layers + if: ${{ !contains(matrix.runner, 'nvidiagpu') }} run: docker system prune -f - name: Run tests run: | - docker run ${{ matrix.flag }} --rm -t --name testrun 'devitocodes/devito:${{ matrix.tag }}-dev' pytest ${{ matrix.test }} + docker run ${{ matrix.flag }} --rm -t --name "${CONTAINER_NAME}" \ + devitocodes/devito:${{ matrix.tag }}-dev \ + pytest ${{ matrix.test }} diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index ab4946ba78..b56f50bec9 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -1,7 +1,12 @@ # Runner information: -# OpenACC on NVidia runs on `nvidiagpu` -# OpenMP on NVidia runs on `nvidiagpu` -# OpenMP on AMD runs on `amdgpu` +# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu` +# - OpenMP on AMD runs on runners labeled `amdgpu` +# +# Changes vs original: +# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…" +# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse +# * Remove docker prune / global container deletes (we assume disk space is fine) +# * Add comments throughout name: CI-gpu @@ -14,36 +19,33 @@ env: RESOURCE_GROUP: CI-gpu on: - # Trigger the workflow on push or pull request, - # but only for the main branch push: - branches: - - main + branches: [ main ] pull_request: - branches: - - main - # Push-button activation + branches: [ main ] workflow_dispatch: inputs: tags: description: 'Run GPU tests' jobs: - build: name: ${{ matrix.name }} runs-on: ${{ matrix.tags }} + # Job-level env (includes per-runner image/container tags) env: DEVITO_ARCH: ${{ matrix.arch }} DEVITO_PLATFORM: ${{ matrix.platform }} DEVITO_LANGUAGE: ${{ matrix.language }} OMPI_CC: ${{ matrix.arch }} + # Unique image tag per *runner*, so caches are reused on that runner but isolated across runners + DOCKER_IMAGE: ${{ matrix.name }}-${{ runner.name }} + # Base name for containers started in this job + CONTAINER_BASENAME: testrun-${{ matrix.name }}-${{ runner.name }} strategy: - # Prevent all builds from terminating if one fails fail-fast: false - matrix: name: [ pytest-gpu-acc-nvidia, @@ -52,48 +54,66 @@ jobs: test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"] include: + # -------------------- NVIDIA job -------------------- - name: pytest-gpu-acc-nvidia test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" base: "devitocodes/bases:nvidia-nvc" tags: ["self-hosted", "nvidiagpu"] test_drive_cmd: "nvidia-smi" - flags: '--init --gpus all --rm -t --name testrun-nvc' - + # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. + # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). + flags: >- + --init --rm -t + --name ${CONTAINER_BASENAME} + --env CUDA_VISIBLE_DEVICES + --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" + + # -------------------- AMD job ----------------------- - name: pytest-gpu-omp-amd test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" tags: ["self-hosted", "amdgpu"] base: "devitocodes/bases:amd" test_drive_cmd: "rocm-smi" - # Attach the AMD GPU devices `/dev` and add user to video and render (109 on wampa) group - # Options from https://rocmdocs.amd.com/en/latest/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.html - flags: "--init --network=host --device=/dev/kfd --device=/dev/dri --ipc=host --group-add video --group-add $(getent group render | cut -d: -f3) --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --rm -t --name testrun-amd" + # Unchanged, still passes through required /dev nodes etc. + flags: >- + --init --network=host + --device=/dev/kfd --device=/dev/dri + --ipc=host + --group-add video --group-add $(getent group render | cut -d: -f3) + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + --rm -t + --name ${CONTAINER_BASENAME} steps: - - name: Checkout devito - uses: actions/checkout@v4 - - - name: Build docker image - run: | - docker build . --rm --pull --file docker/Dockerfile.devito --tag ${{ matrix.name }}-${{ github.ref }} --build-arg base=${{ matrix.base }} - - - name: Remove dangling layers - run: | - docker system prune -f - echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV - - - name: Test with pytest - run: | - # Remove all existing containers for safety - docker ps -aq | xargs -r docker rm -f - # Run tests - ci_env=`bash <(curl -s https://codecov.io/env)` - docker run ${{ matrix.flags }} ${{ matrix.name }}-${{ github.ref }} ${{ matrix.test_drive_cmd }} - docker run ${{ matrix.flags }} $ci_env -e CI=true ${{ matrix.name }}-${{ github.ref }} pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} - - - name: Test examples - run: | - docker run ${{ matrix.flags }} ${{ matrix.name }}-${{ github.ref }} pytest ${{ matrix.test_examples }} - - - name: Test examples with MPI - run: | - docker run ${{ matrix.flags }} --env DEVITO_MPI=1 ${{ matrix.name }}-${{ github.ref }} mpiexec -n 2 pytest ${{ matrix.test_examples }} + - name: Checkout devito + uses: actions/checkout@v4 + + - name: Build docker image + run: | + docker build . \ + --rm --pull \ + --file docker/Dockerfile.devito \ + --tag "${DOCKER_IMAGE}" \ + --build-arg base="${{ matrix.base }}" + + - name: Export CODECOV token + run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV" + + - name: Test with pytest + run: | + # Run a simple driver cmd first (nvidia-smi / rocm-smi) + ci_env=$(bash <(curl -s https://codecov.io/env)) + + docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} + + docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \ + pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} + + - name: Test examples + run: | + docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" pytest ${{ matrix.test_examples }} + + - name: Test examples with MPI + run: | + docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \ + mpiexec -n 2 pytest ${{ matrix.test_examples }} From 8b8f17e32d0a50184a7dc6134b98cecd844ab122 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Wed, 23 Jul 2025 15:39:27 +0100 Subject: [PATCH 2/5] Update .github/workflows/docker-devito.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/docker-devito.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index dd7a67b35a..3cbc8a5f69 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -13,7 +13,7 @@ jobs: env: # Use buildkit https://docs.docker.com/develop/develop-images/build_enhancements/ for better build DOCKER_BUILDKIT: "1" - # Unique container name to avoid clashes across concurrent self‑hosted runners + # Unique container name to avoid clashes across concurrent self-hosted runners CONTAINER_NAME: testrun-${{ matrix.tag }}-${{ runner.name }} From 3b775719fd0a737806c939549abffb37bdfa8fc0 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Wed, 23 Jul 2025 15:39:47 +0100 Subject: [PATCH 3/5] Update .github/workflows/docker-devito.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/docker-devito.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 3cbc8a5f69..ec5a751eb9 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -23,7 +23,7 @@ jobs: include: - base: 'bases:nvidia-nvc' tag: 'nvidia-nvc' - # Respect CUDA_VISIBLE_DEVICES set by the runner and hard‑limit docker to that device. + # Respect CUDA_VISIBLE_DEVICES set by the runner and hard-limit docker to that device. # (--env without value forwards host var; --gpus maps only that device) flag: | --init From bd54c08bb2cb9c389d16e8df87baa27018af6e77 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Wed, 23 Jul 2025 16:05:23 +0100 Subject: [PATCH 4/5] Fix formatting. --- .github/workflows/pytest-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index b56f50bec9..696a4e0d2c 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -31,7 +31,7 @@ on: jobs: build: name: ${{ matrix.name }} - runs-on: ${{ matrix.tags }} + runs-on: ${{ fromJSON('["' + join(matrix.tags, '","') + '"]') }} # Job-level env (includes per-runner image/container tags) env: From 7b86c57ca66afdfe7b45f702d143ebeab19743a8 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Wed, 23 Jul 2025 16:17:03 +0100 Subject: [PATCH 5/5] Fix formatting v2. --- .github/workflows/pytest-gpu.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 696a4e0d2c..2519fb287c 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -31,7 +31,9 @@ on: jobs: build: name: ${{ matrix.name }} - runs-on: ${{ fromJSON('["' + join(matrix.tags, '","') + '"]') }} + runs-on: + - self-hosted + - ${{ matrix.runner_label }} # Job-level env (includes per-runner image/container tags) env: @@ -39,10 +41,6 @@ jobs: DEVITO_PLATFORM: ${{ matrix.platform }} DEVITO_LANGUAGE: ${{ matrix.language }} OMPI_CC: ${{ matrix.arch }} - # Unique image tag per *runner*, so caches are reused on that runner but isolated across runners - DOCKER_IMAGE: ${{ matrix.name }}-${{ runner.name }} - # Base name for containers started in this job - CONTAINER_BASENAME: testrun-${{ matrix.name }}-${{ runner.name }} strategy: fail-fast: false @@ -58,7 +56,7 @@ jobs: - name: pytest-gpu-acc-nvidia test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" base: "devitocodes/bases:nvidia-nvc" - tags: ["self-hosted", "nvidiagpu"] + runner_label: nvidiagpu test_drive_cmd: "nvidia-smi" # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). @@ -71,7 +69,7 @@ jobs: # -------------------- AMD job ----------------------- - name: pytest-gpu-omp-amd test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" - tags: ["self-hosted", "amdgpu"] + runner_label: amdgpu base: "devitocodes/bases:amd" test_drive_cmd: "rocm-smi" # Unchanged, still passes through required /dev nodes etc. @@ -88,6 +86,12 @@ jobs: - name: Checkout devito uses: actions/checkout@v4 + - name: Set per-runner tags + run: | + echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV + echo "DOCKER_IMAGE=${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV + echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV + - name: Build docker image run: | docker build . \