Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/_forge.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ steps:
python:
- "3.11"
cuda:
- "12.4.1-cudnn"
- "12.8.1-cudnn"
env:
PYTHON_VERSION: "{{matrix.python}}"
CUDA_VERSION: "{{matrix.cuda}}"
Expand Down
6 changes: 6 additions & 0 deletions .buildkite/base.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ steps:
env:
PYTHON: "{{matrix}}"

- name: oss-ci-base_cu128-multipy
label: "wanda: oss-ci-base_cu128-py3.11"
wanda: ci/docker/base.cu128.wanda.yaml
env:
PYTHON: "3.11"

- name: docbuild
label: "wanda: docbuild-py{{matrix}}"
wanda: ci/docker/doc.build.wanda.yaml
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/build.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ steps:
instance_type: medium
commands:
- bazel run //ci/ray_ci:build_in_docker -- docker --python-version {{matrix}}
--platform cu12.4.1-cudnn --image-type ray-llm --upload
--platform cu12.8.1-cudnn --image-type ray-llm --upload
depends_on:
- manylinux
- forge
Expand Down
6 changes: 3 additions & 3 deletions .buildkite/llm.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ steps:
- name: llmgpubuild
wanda: ci/docker/llm.build.wanda.yaml
depends_on:
- oss-ci-base_gpu-multipy
- oss-ci-base_cu128-multipy
env:
IMAGE_TO: "llmgpubuild"
IMAGE_FROM: "cr.ray.io/rayproject/oss-ci-base_gpu-py3.11"
RAY_CUDA_CODE: "cu121"
IMAGE_FROM: "cr.ray.io/rayproject/oss-ci-base_cu128-py3.11"
RAY_CUDA_CODE: "cu128"

- label: "llm cpu tests"
key: "llm-cpu-tests"
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/release/build.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ steps:
instance_type: release-medium
commands:
- bazel run //ci/ray_ci:build_in_docker -- anyscale --python-version {{matrix}}
--platform cu12.4.1-cudnn --image-type ray-llm --upload
--platform cu12.8.1-cudnn --image-type ray-llm --upload
depends_on:
- manylinux
- forge
Expand Down
3 changes: 1 addition & 2 deletions ci/compile_llm_requirements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ if [[ "${PYTHON_CODE}" != "py311" ]]; then
exit 1
fi

for CUDA_CODE in cpu cu121 cu124 ; do
for CUDA_CODE in cpu cu121 cu128; do
PYTHON_CUDA_CODE="${PYTHON_CODE}_${CUDA_CODE}"

echo "--- Compile dependencies for ${PYTHON_CODE}_${CUDA_CODE}"
Expand All @@ -23,7 +23,6 @@ for CUDA_CODE in cpu cu121 cu124 ; do
--unsafe-package setuptools
--index-url "https://pypi.org/simple"
--extra-index-url "https://download.pytorch.org/whl/${CUDA_CODE}"
--find-links "https://data.pyg.org/whl/torch-2.5.1+${CUDA_CODE}.html"
--index-strategy unsafe-best-match
--no-strip-markers
--emit-index-url
Expand Down
17 changes: 17 additions & 0 deletions ci/docker/base.cu128.wanda.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Wanda build spec for the CUDA 12.8 variant of the OSS CI base image.
# NOTE(review): $PYTHON is presumably expanded from the build step's
# environment (the pipeline step sets PYTHON: "3.11") — confirm.
name: "oss-ci-base_cu128-py$PYTHON"
# Upstream NVIDIA image this build starts from. The same reference is repeated
# in the BASE_IMAGE build arg below; keep the two in sync.
froms: ["nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04"]
# Reuses the shared GPU base Dockerfile; the CUDA version is selected through
# the BASE_IMAGE build arg rather than a separate Dockerfile.
dockerfile: ci/docker/base.gpu.Dockerfile
# NOTE(review): presumably the files exposed to the Docker build context —
# verify against the wanda spec.
srcs:
- .bazelrc
- .bazelversion
- ci/env/install-dependencies.sh
- ci/env/install-llvm-binaries.sh
- ci/env/install-bazel.sh
- ci/env/install-miniforge.sh
- ci/suppress_output
build_args:
- REMOTE_CACHE_URL=$BUILDKITE_BAZEL_CACHE_URL
- PYTHON
# Overrides the Dockerfile's default BASE_IMAGE (the CUDA 12.1.1 image).
- BASE_IMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04
tags:
- cr.ray.io/rayproject/oss-ci-base_cu128-py$PYTHON
3 changes: 2 additions & 1 deletion ci/docker/base.gpu.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
ARG BASE_IMAGE=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
FROM $BASE_IMAGE

ARG REMOTE_CACHE_URL
ARG BUILDKITE_PULL_REQUEST
Expand Down
2 changes: 1 addition & 1 deletion ci/docker/llm.build.wanda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ srcs:
- ci/env/install-llvm-binaries.sh
- ci/suppress_output
- python/requirements_compiled_rayllm_test_py311_cpu.txt
- python/requirements_compiled_rayllm_test_py311_cu121.txt
- python/requirements_compiled_rayllm_test_py311_cu128.txt
tags:
- cr.ray.io/rayproject/$IMAGE_TO
build_args:
Expand Down
2 changes: 1 addition & 1 deletion ci/docker/ray-llm.base.wanda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base"]
dockerfile: docker/ray-llm/Dockerfile
srcs:
- python/requirements.txt
- python/requirements_compiled_rayllm_py311_cu124.txt
- python/requirements_compiled_rayllm_py311_cu128.txt
build_args:
- BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base
tags:
Expand Down
1 change: 0 additions & 1 deletion ci/lint/pydoclint-baseline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,6 @@ python/ray/data/_internal/execution/legacy_compat.py
--------------------
python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
DOC103: Method `ActorPoolMapOperator.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [data_context: DataContext, map_transformer: MapTransformer]. Arguments in the docstring but not in the function signature: [init_fn: , transform_fn: ].
DOC201: Method `_ActorPool.pick_actor` does not have a return section in docstring
--------------------
python/ray/data/_internal/execution/operators/base_physical_operator.py
DOC101: Method `OneToOneOperator.__init__`: Docstring contains fewer arguments than in function signature.
Expand Down
26 changes: 13 additions & 13 deletions ci/ray_ci/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,27 @@
import os
import subprocess
import sys
import re

from typing import List, Tuple, Optional


_CUDA_COPYRIGHT = """
==========
# Regex pattern to match CUDA copyright header with any version
_CUDA_COPYRIGHT_PATTERN = r"""==========
== CUDA ==
==========

CUDA Version 12.1.1
CUDA Version \d+\.\d+(?:\.\d+)?

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Container image Copyright \(c\) 2016-2023, NVIDIA CORPORATION & AFFILIATES\. All rights reserved\.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
This container image and its contents are governed by the NVIDIA Deep Learning Container License\.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
https://developer\.nvidia\.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience\.
"""

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
""" # noqa: E501
_DOCKER_ECR_REPO = os.environ.get(
"RAYCI_WORK_REPO",
"029272617770.dkr.ecr.us-west-2.amazonaws.com/rayproject/citemp",
Expand Down Expand Up @@ -64,11 +66,9 @@ def run_script_with_output(self, script: List[str]) -> str:
Run a script in the container and return its output.
"""
# CUDA image comes with a license header that we need to remove
return (
subprocess.check_output(self.get_run_command(script))
.decode("utf-8")
.replace(_CUDA_COPYRIGHT, "")
)
output = subprocess.check_output(self.get_run_command(script)).decode("utf-8")
# Use regex to remove CUDA copyright header with any version
return re.sub(_CUDA_COPYRIGHT_PATTERN, "", output, flags=re.MULTILINE)

def run_script(self, script: List[str]) -> None:
"""
Expand Down
6 changes: 3 additions & 3 deletions ci/test_compile_llm_requirements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ echo "Created temporary directory: $TEMP_DIR"
# Create backup copies of the requirements files to compare against after recompiling
cp ./python/requirements_compiled_rayllm_py311_cpu.txt "$TEMP_DIR/requirements_compiled_rayllm_py311_cpu_backup.txt"
cp ./python/requirements_compiled_rayllm_py311_cu121.txt "$TEMP_DIR/requirements_compiled_rayllm_py311_cu121_backup.txt"
cp ./python/requirements_compiled_rayllm_py311_cu124.txt "$TEMP_DIR/requirements_compiled_rayllm_py311_cu124_backup.txt"
cp ./python/requirements_compiled_rayllm_py311_cu128.txt "$TEMP_DIR/requirements_compiled_rayllm_py311_cu128_backup.txt"

./ci/compile_llm_requirements.sh

# Copy files to artifact mount on Buildkite
cp ./python/requirements_compiled_rayllm_py311_cpu.txt /artifact-mount/
cp ./python/requirements_compiled_rayllm_py311_cu121.txt /artifact-mount/
cp ./python/requirements_compiled_rayllm_py311_cu124.txt /artifact-mount/
cp ./python/requirements_compiled_rayllm_py311_cu128.txt /artifact-mount/

# Check all files and print if files are not up to date
FAILED=0
for VARIANT in cpu cu121 cu124; do
for VARIANT in cpu cu121 cu128; do
diff --color -u ./python/requirements_compiled_rayllm_py311_${VARIANT}.txt "$TEMP_DIR/requirements_compiled_rayllm_py311_${VARIANT}_backup.txt" || {
echo "requirements_compiled_rayllm_py311_${VARIANT}.txt is not up to date. Please download it from Artifacts tab and git push the changes."
FAILED=1
Expand Down
4 changes: 2 additions & 2 deletions docker/ray-llm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ set -euo pipefail

PYTHON_CODE="$(python -c "import sys; v=sys.version_info; print(f'py{v.major}{v.minor}')")"

# ray-llm image only support cuda 12.4 for now.
CUDA_CODE=cu124
# The ray-llm image only supports CUDA 12.8.
CUDA_CODE=cu128

if [[ "${PYTHON_CODE}" != "py311" ]]; then
echo "ray-llm only support Python 3.11 now (this image is for ${PYTHON_CODE})."
Expand Down
5 changes: 5 additions & 0 deletions python/ray/dashboard/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@

NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode"]
GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"]

# The number of distinct TpuDeviceName/TpuIndex tag values is expected to equal
# the number of TPU chips in the cluster. The number of distinct
# TpuType/TpuTopology values is proportional to the number of node pools.
TPU_TAG_KEYS = NODE_TAG_KEYS + ["TpuDeviceName", "TpuIndex", "TpuType", "TpuTopology"]
CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]

Expand Down
7 changes: 2 additions & 5 deletions python/ray/dashboard/modules/reporter/gpu_profile_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
import os
import shutil
import socket
import subprocess
from datetime import datetime
from pathlib import Path
Expand Down Expand Up @@ -61,16 +60,14 @@ class GpuProfilingManager:
"GPU profiling is not available for this process."
)

def __init__(self, profile_dir_path: str):
def __init__(self, profile_dir_path: str, *, ip_address: str):
# Dump trace files to: /tmp/ray/session_latest/logs/profiles/
self._root_log_dir = Path(profile_dir_path)
self._profile_dir_path = self._root_log_dir / "profiles"
self._daemon_log_file_path = (
self._profile_dir_path / f"dynolog_daemon_{os.getpid()}.log"
)

hostname = socket.gethostname()
self._ip_address = socket.gethostbyname(hostname)
self._ip_address = ip_address

self._dynolog_bin = shutil.which("dynolog")
self._dyno_bin = shutil.which("dyno")
Expand Down
Loading