Describe the bug
Saving a checkpoint fails when training Qwen3-Next-80B-A3B-Instruct with the Megatron backend: `save_checkpoint` on the actor workers raises `megatron.core.dist_checkpointing.core.CheckpointingException: ShardedTensor.flattened_range is not supported.` while generating the distributed optimizer's sharded state dict (optimizer CPU offload and the precision-aware optimizer are enabled, see the script below).
System Info
verl version: 0.7.0
image: verlai/verl:vllm012.exp
python: 3.12.3
flash-linear-attention 0.4.0
megatron-bridge 0.3.0rc0
megatron-core 0.16.0rc0
torch 2.10.0a0+b558c986e8.nv25.11
vllm 0.12.0+cu130 /opt/vllm
(TaskRunner pid=3745472) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_save_checkpoint() (pid=3335930, ip=172.16.1.82, actor_id=0acd555073b76bb0978d0b6102000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fa6a2a19b80>)
  File "/usr/lib/python3.12/concurrent/futures/_base.py", line 456, in result
    return self.__get_result()
  File "/usr/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/single_controller/ray/base.py", line 841, in func
    return getattr(self.worker_dict[key], name)(*args, **kwargs)
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/single_controller/base/decorator.py", line 456, in inner
    return func(*args, **kwargs)
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/utils/transferqueue_utils.py", line 314, in dummy_inner
    output = func(*args, **kwargs)
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/workers/megatron_workers.py", line 913, in save_checkpoint
    self.checkpoint_mananager.save_checkpoint(
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/utils/checkpoint/megatron_checkpoint_manager.py", line 462, in save_checkpoint
    state_dict = self.generate_state_dict(
  File "/tmp/ray/session_2026-01-14_21-11-56_359831_3660637/runtime_resources/working_dir_files/_ray_pkg_b9c1a93bdfb45447/verl/utils/checkpoint/megatron_checkpoint_manager.py", line 269, in generate_state_dict
    optimizer_sharded_states = self.optimizer.sharded_state_dict(state_dict, is_loading=is_loading)
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/optimizer/optimizer.py", line 1208, in sharded_state_dict
    optim_state_dict = optimizer.sharded_state_dict(
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/optimizer/distrib_optimizer.py", line 1271, in sharded_state_dict
    param_state = self.sharded_param_state_fs_model_space(
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/optimizer/distrib_optimizer.py", line 1745, in sharded_param_state_fs_model_space
    tensors = _get_param_state_sharded_tensors(
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/optimizer/distrib_optimizer.py", line 1733, in _get_param_state_sharded_tensors
    tensors[state_key] = replace(sharded_metadata, **replace_kwargs)
  File "/usr/lib/python3.12/dataclasses.py", line 1581, in replace
    return obj.__class__(**changes)
  File "<string>", line 14, in __init__
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/dist_checkpointing/mapping.py", line 94, in __post_init__
    self.validate_metadata_integrity()
  File "/usr/local/lib/python3.12/dist-packages/megatron/core/dist_checkpointing/mapping.py", line 134, in validate_metadata_integrity
    raise CheckpointingException("ShardedTensor.flattened_range is not supported.")
megatron.core.dist_checkpointing.core.CheckpointingException: ShardedTensor.flattened_range is not supported.
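For context on the last few frames: `dataclasses.replace()` never modifies an object in place; it re-invokes the dataclass constructor with the merged fields, so `__post_init__` validation runs again on the rebuilt instance. The sketch below is a minimal, stdlib-only illustration of that mechanism; `ToyShardedTensor` and its validator are hypothetical stand-ins, not Megatron's real `ShardedTensor`.

```python
# Minimal sketch of the mechanism in the last frames above (stdlib only).
# ToyShardedTensor is a hypothetical stand-in, NOT megatron's ShardedTensor.
from dataclasses import dataclass, replace
from typing import Optional


class CheckpointingException(Exception):
    """Stand-in for megatron.core.dist_checkpointing.core.CheckpointingException."""


@dataclass
class ToyShardedTensor:
    key: str
    flattened_range: Optional[slice] = None

    def __post_init__(self):
        # Validator shaped like the one in the log: reject flattened_range outright.
        if self.flattened_range is not None:
            raise CheckpointingException("ShardedTensor.flattened_range is not supported.")


ok = ToyShardedTensor(key="optimizer.state.exp_avg")  # constructs fine

# replace() builds a NEW ToyShardedTensor via __init__, so __post_init__ runs again
# on the merged fields and the validator raises -- the same failure shape as
# `replace(sharded_metadata, **replace_kwargs)` in the traceback.
replace(ok, flattened_range=slice(0, 128))
```

In the real stack the object being rebuilt comes from `sharded_param_state_fs_model_space`, i.e. the distributed optimizer's fully-sharded (flattened) parameter state, which is presumably why `flattened_range` is populated in the first place.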
Steps/Code to reproduce bug
#!/usr/bin/env bash
set -xeuo pipefail
# pip install langdetect immutabledict 'nltk>=3.9.1' rouge jieba -i https://mirrors-ssl.aliyuncs.com/pypi/simple
# pip uninstall megatron-core -y
# pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@dev # install megatron from dev branch
export NCCL_DEBUG=WARN
export VLLM_USE_V1=1
export HYDRA_FULL_ERROR=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# export VERL_LOGGING_LEVEL=DEBUG
export TIKTOKEN_ENCODINGS_BASE=${PWD}/tiktoken_encodings
export NLTK_DATA=nltk_data
python -c "import nltk; nltk.download('punkt_tab')"
adv_estimator=grpo
use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=True
kl_loss_coef=0.001
clip_ratio_low=3e-4
clip_ratio_high=4e-4
max_prompt_length=$((1024 * 1))
max_response_length=$((1024 * 1))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
loss_agg_mode="token-mean"
loss_mode=gspo
lr=5e-7
train_prompt_bsz=32
n_resp_per_prompt=16
train_prompt_mini_bsz=32
# ref infer
infer_micro_batch_size_per_gpu=2
# Ray
# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${SLURM_JOB_NUM_NODES:-1}
NGPUS_PER_NODE=${SLURM_GPUS_PER_NODE:-8}
echo "NNODES: $NNODES"
echo "NGPUS_PER_NODE: $NGPUS_PER_NODE"
# Paths
# RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
# MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"}
# CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
# TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
# TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7
# Performance Related Parameter
use_dynamic_bsz=False
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
offload=True
# gen
rollout_name=vllm # vllm or sglang
dtype="bfloat16" # ["bfloat16", "float16"]
rollout_mode=async
gen_tp=4
gen_dp=1
gen_ep=1 # gen_ep = gen_tp * gen_dp
# train
train_tp=1
train_pp=2
EP=16
ETP=1
dataset_name=$1
project_name='DAPO'
exp_name="GSPO-qwen3-next-${dataset_name}"
TENSORBOARD_DIR=tensorboard_logs/verl/${project_name}/${dataset_name}/${exp_name}
MODEL_PATH=qwen/Qwen3-Next-80B-A3B-Instruct
CKPTS_DIR=verl/${project_name}/${dataset_name}/${exp_name}
TRAIN_FILE=data/grpo/mix_train_v6.parquet
TEST_FILE=data/grpo/mix_test_v6.parquet
# Compact multi-line JSON into a single line (required by the Hydra parser)
TRAIN_FILE_COMPACT=$(echo "${TRAIN_FILE}" | tr -d '\n' | sed 's/ */ /g' | sed 's/\[ /[/g' | sed 's/ \]/]/g')
TEST_FILE_COMPACT=$(echo "${TEST_FILE}" | tr -d '\n' | sed 's/ */ /g' | sed 's/\[ /[/g' | sed 's/ \]/]/g')
################################################### start of config ###################################################
GPT_OSS_CONFIG=(
# only support mbridge for gptoss
actor_rollout_ref.actor.megatron.use_mbridge=True
actor_rollout_ref.actor.megatron.vanilla_mbridge=False
# for now (latest TE=2.10), gptoss's optimized attention kernel does not support the thd format, so we use the bshd format here
# with bshd, the input_ids must be padded to the longest sequence length,
# so we recommend disabling dynamic batch size and setting the micro batch size to 1 to avoid padding
# (it is still fine to try micro_batch_size > 1)
actor_rollout_ref.actor.megatron.use_remove_padding=False
)
DATA=(
data.train_files="${TRAIN_FILE_COMPACT}"
data.val_files="${TEST_FILE_COMPACT}"
data.prompt_key=prompt
data.return_raw_chat=True
data.truncation='left'
data.max_prompt_length=${max_prompt_length}
data.max_response_length=${max_response_length}
data.train_batch_size=${train_prompt_bsz}
)
REWARD_MODEL=(
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer}
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len}
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor}
+reward_model.reward_kwargs.overlong_buffer_cfg.log=True
+reward_model.reward_kwargs.max_resp_len=${max_response_length}
reward_model.reward_manager=xp_reward
)
PERF_OPT=(
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True
actor_rollout_ref.model.use_fused_kernels=False
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
actor_rollout_ref.actor.optim.use_checkpoint_opt_param_scheduler=True
)
ACTOR=(
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
actor_rollout_ref.actor.clip_ratio_c=10.0
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
actor_rollout_ref.actor.optim.lr=1e-6
actor_rollout_ref.actor.optim.lr_warmup_steps=10
actor_rollout_ref.actor.optim.weight_decay=0.1
actor_rollout_ref.actor.optim.clip_grad=1.0
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
actor_rollout_ref.actor.megatron.param_offload=${offload}
actor_rollout_ref.actor.megatron.optimizer_offload=${offload}
actor_rollout_ref.actor.megatron.grad_offload=${offload}
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.actor.entropy_coeff=0
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
actor_rollout_ref.model.use_remove_padding=False
actor_rollout_ref.actor.megatron.sequence_parallel=False
)
REF=(
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.ref.megatron.param_offload=${offload}
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_micro_batch_size_per_gpu}
actor_rollout_ref.ref.megatron.sequence_parallel=False
)
ROLLOUT=(
actor_rollout_ref.rollout.name=${rollout_name}
actor_rollout_ref.rollout.mode=${rollout_mode}
actor_rollout_ref.rollout.dtype=${dtype}
actor_rollout_ref.rollout.gpu_memory_utilization=0.70
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
actor_rollout_ref.rollout.enable_chunked_prefill=True
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length))
actor_rollout_ref.rollout.temperature=${temperature}
actor_rollout_ref.rollout.top_p=${top_p}
actor_rollout_ref.rollout.top_k=${top_k}
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature}
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}
actor_rollout_ref.rollout.val_kwargs.do_sample=True
actor_rollout_ref.rollout.val_kwargs.n=1
actor_rollout_ref.rollout.calculate_log_probs=True
actor_rollout_ref.rollout.n=${n_resp_per_prompt}
actor_rollout_ref.rollout.enforce_eager=True
actor_rollout_ref.rollout.free_cache_engine=True
)
TRAINER=(
trainer.logger=['console','tensorboard']
trainer.project_name="${project_name}"
trainer.experiment_name="${exp_name}"
trainer.n_gpus_per_node="${NGPUS_PER_NODE}"
trainer.nnodes="${NNODES}"
trainer.val_before_train=False
trainer.test_freq=10
trainer.save_freq=10
trainer.total_epochs=10
trainer.default_local_dir="${CKPTS_DIR}"
trainer.resume_mode=auto
trainer.log_val_generations=10
trainer.rollout_data_dir="${CKPTS_DIR}/rollout_data"
+trainer.validation_data_dir="${CKPTS_DIR}/validation_data"
)
FORWARD_ONLY_SETS=(
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
)
MODEL=(
actor_rollout_ref.model.path="${MODEL_PATH}"
)
ALGORITHM=(
algorithm.adv_estimator=${adv_estimator}
algorithm.use_kl_in_reward=${use_kl_in_reward}
algorithm.kl_ctrl.kl_coef=${kl_coef}
)
################################################### start of Ray ###################################################
MASTER_IP=$(getent hosts ${MASTER_ADDR} | awk '{print $1}')
RAY_NODE_PORT=6379
if [ "$NODE_RANK" == "0" ]; then
echo "Starting Ray head node"
ray start --head --port=$RAY_NODE_PORT
echo "Starting Ray head node done"
sleep 30
else
sleep 10
echo "Starting Ray worker node $NODE_RANK"
ray start --address="${MASTER_IP}:${RAY_NODE_PORT}"
fi
sleep 15
rm -rf /code/fuyao_agent
rm -rf /code/smart-v*
rm -rf /code/c4d
ray status
echo "[${RANK}] Ray started. Launching training or keeping node alive ..."
if [ "$NODE_RANK" == "0" ]; then
# Create local_dir if it does not exist
mkdir -p ${CKPTS_DIR}
# Copy the current script to local_dir
cp $0 ${CKPTS_DIR}
ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json='{"working_dir": "/code", "excludes": [], "env_vars": {"TENSORBOARD_DIR": "'${TENSORBOARD_DIR}'", "CUDA_DEVICE_MAX_CONNECTIONS": "1", "VLLM_WORKER_MULTIPROC_METHOD": "spawn"}}' \
-- \
python3 -m verl.trainer.main_ppo \
--config-path=config \
--config-name='ppo_megatron_trainer_custom_v3.yaml' \
"${DATA[@]}" \
"${ALGORITHM[@]}" \
"${MODEL[@]}" \
"${REF[@]}" \
"${ROLLOUT[@]}" \
"${ACTOR[@]}" \
"${REWARD_MODEL[@]}" \
"${PERF_OPT[@]}" \
"${TRAINER[@]}" \
"${GPT_OSS_CONFIG[@]}" \
"${FORWARD_ONLY_SETS[@]}"
else
while true; do
# Get the current timestamp and write it to the log
echo "$(date '+%Y-%m-%d %H:%M:%S') - Logging message"
# Wait 10 minutes
sleep 600
done
fi
**Expected behavior**
The checkpoint should be saved successfully every `trainer.save_freq` steps, without the actor workers raising `CheckpointingException`.
**Additional context**
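The script above optionally reinstalls `megatron-core` from the Megatron-LM dev branch (see the commented `pip install` lines), so the `mapping.py` that raises may differ between environments. A small, hypothetical triage snippet (not part of verl) to confirm which `megatron.core.dist_checkpointing.mapping` the worker actually imports and where the rejecting check lives:

```python
# Hypothetical triage helper: locate the installed mapping.py and the line that raises
# "ShardedTensor.flattened_range is not supported.", to confirm which megatron-core
# build (release vs. dev branch) is actually being imported.
import inspect

from megatron.core.dist_checkpointing import mapping

print("mapping.py in use:", mapping.__file__)
for lineno, line in enumerate(inspect.getsource(mapping).splitlines(), start=1):
    if "flattened_range is not supported" in line:
        print(f"line {lineno}: {line.strip()}")
```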